av1_dec_fuzzer.cc:_ZL47aom_codec_control_typechecked_AV1_SET_TILE_MODEP13aom_codec_ctxij:
  555|  16.1k|      aom_codec_ctx_t *ctx, int ctrl, typ data) {            \
  556|  16.1k|    return aom_codec_control(ctx, ctrl, data);               \
  557|  16.1k|  } /**<\hideinitializer*/                                   \
av1_dec_fuzzer.cc:_ZL49aom_codec_control_typechecked_AV1D_EXT_TILE_DEBUGP13aom_codec_ctxij:
  555|  16.1k|      aom_codec_ctx_t *ctx, int ctrl, typ data) {            \
  556|  16.1k|    return aom_codec_control(ctx, ctrl, data);               \
  557|  16.1k|  } /**<\hideinitializer*/                                   \
av1_dec_fuzzer.cc:_ZL48aom_codec_control_typechecked_AV1D_SET_IS_ANNEXBP13aom_codec_ctxij:
  555|  16.1k|      aom_codec_ctx_t *ctx, int ctrl, typ data) {            \
  556|  16.1k|    return aom_codec_control(ctx, ctrl, data);               \
  557|  16.1k|  } /**<\hideinitializer*/                                   \
av1_dec_fuzzer.cc:_ZL56aom_codec_control_typechecked_AV1D_SET_OUTPUT_ALL_LAYERSP13aom_codec_ctxii:
  555|  16.1k|      aom_codec_ctx_t *ctx, int ctrl, typ data) {            \
  556|  16.1k|    return aom_codec_control(ctx, ctrl, data);               \
  557|  16.1k|  } /**<\hideinitializer*/                                   \
av1_dec_fuzzer.cc:_ZL54aom_codec_control_typechecked_AV1D_SET_OPERATING_POINTP13aom_codec_ctxii:
  555|  16.1k|      aom_codec_ctx_t *ctx, int ctrl, typ data) {            \
  556|  16.1k|    return aom_codec_control(ctx, ctrl, data);               \
  557|  16.1k|  } /**<\hideinitializer*/                                   \

aom_codec.c:at_ctrl_map_end:
  192|   825k|static inline int at_ctrl_map_end(aom_codec_ctrl_fn_map_t *e) {
  193|   825k|  return e->ctrl_id == 0 && e->fn == NULL;
  ------------------
  |  Branch (193:10): [True: 0, False: 825k]
  |  Branch (193:29): [True: 0, False: 0]
  ------------------
  194|   825k|}

aom_codec_destroy:
   68|  16.1k|aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
   69|  16.1k|  if (!ctx) {
  ------------------
  |  Branch (69:7): [True: 0, False: 16.1k]
  ------------------
   70|      0|    return AOM_CODEC_INVALID_PARAM;
   71|      0|  }
   72|  16.1k|  if (!ctx->iface || !ctx->priv) {
  ------------------
  |  Branch (72:7): [True: 0, False: 16.1k]
  |  Branch (72:22): [True: 0, False: 16.1k]
  ------------------
   73|      0|    ctx->err = AOM_CODEC_ERROR;
   74|      0|    return AOM_CODEC_ERROR;
   75|      0|  }
   76|  16.1k|  ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
   77|  16.1k|  ctx->iface = NULL;
   78|  16.1k|  ctx->name = NULL;
   79|  16.1k|  ctx->priv = NULL;
   80|  16.1k|  ctx->err = AOM_CODEC_OK;
   81|  16.1k|  return AOM_CODEC_OK;
   82|  16.1k|}
aom_codec_control:
   88|  80.9k|aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
   89|  80.9k|  if (!ctx) {
  ------------------
  |  Branch (89:7): [True: 0, False: 80.9k]
  ------------------
   90|      0|    return AOM_CODEC_INVALID_PARAM;
   91|      0|  }
   92|       |  // Control ID must be non-zero.
   93|  80.9k|  if (!ctrl_id) {
  ------------------
  |  Branch (93:7): [True: 0, False: 80.9k]
  ------------------
   94|      0|    ctx->err = AOM_CODEC_INVALID_PARAM;
   95|      0|    return AOM_CODEC_INVALID_PARAM;
   96|      0|  }
   97|  80.9k|  if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps) {
  ------------------
  |  Branch (97:7): [True: 0, False: 80.9k]
  |  Branch (97:22): [True: 0, False: 80.9k]
  |  Branch (97:36): [True: 0, False: 80.9k]
  ------------------
   98|      0|    ctx->err = AOM_CODEC_ERROR;
   99|      0|    return AOM_CODEC_ERROR;
  100|      0|  }
  101|       |
  102|       |  // "ctrl_maps" is an array of (control ID, function pointer) elements,
  103|       |  // with CTRL_MAP_END as a sentinel.
  104|  80.9k|  for (aom_codec_ctrl_fn_map_t *entry = ctx->iface->ctrl_maps;
  105|   825k|       !at_ctrl_map_end(entry); ++entry) {
  ------------------
  |  Branch (105:8): [True: 825k, False: 0]
  ------------------
  106|   825k|    if (entry->ctrl_id == ctrl_id) {
  ------------------
  |  Branch (106:9): [True: 80.9k, False: 744k]
  ------------------
  107|  80.9k|      va_list ap;
  108|  80.9k|      va_start(ap, ctrl_id);
  109|  80.9k|      ctx->err = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
  110|  80.9k|      va_end(ap);
  111|  80.9k|      return ctx->err;
  112|  80.9k|    }
  113|   825k|  }
  114|      0|  ctx->err = AOM_CODEC_ERROR;
  115|      0|  ctx->priv->err_detail = "Invalid control ID";
  116|      0|  return AOM_CODEC_ERROR;
  117|  80.9k|}
aom_internal_error:
  160|   147k|                        aom_codec_err_t error, const char *fmt, ...) {
  161|   147k|  va_list ap;
  162|       |
  163|   147k|  va_start(ap, fmt);
  164|   147k|  set_error(info, error, fmt, ap);
  165|   147k|  va_end(ap);
  166|       |
  167|   147k|  if (info->setjmp) longjmp(info->jmp, info->error_code);
  ------------------
  |  Branch (167:7): [True: 147k, False: 18.4E]
  ------------------
  168|   147k|}
aom_merge_corrupted_flag:
  182|  4.89M|void aom_merge_corrupted_flag(int *corrupted, int value) {
  183|  4.89M|  *corrupted |= value;
  184|  4.89M|}
aom_codec.c:set_error:
  135|   147k|                      aom_codec_err_t error, const char *fmt, va_list ap) {
  136|   147k|  info->error_code = error;
  137|   147k|  info->has_detail = 0;
  138|       |
  139|   147k|  if (fmt) {
  ------------------
  |  Branch (139:7): [True: 147k, False: 4]
  ------------------
  140|   147k|    size_t sz = sizeof(info->detail);
  141|       |
  142|   147k|    info->has_detail = 1;
  143|   147k|    vsnprintf(info->detail, sz - 1, fmt, ap);
  144|   147k|    info->detail[sz - 1] = '\0';
  145|   147k|  }
  146|   147k|}

aom_codec_dec_init_ver:
   28|  16.1k|                                       aom_codec_flags_t flags, int ver) {
   29|  16.1k|  aom_codec_err_t res;
   30|       |
   31|  16.1k|  if (ver != AOM_DECODER_ABI_VERSION)
  ------------------
  |  |   45|  16.1k|  (6 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
  |  |  ------------------
  |  |  |  |  152|  16.1k|#define AOM_CODEC_ABI_VERSION (7 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
  |  |  |  |  ------------------
  |  |  |  |  |  |   33|  16.1k|#define AOM_IMAGE_ABI_VERSION (9) /**<\hideinitializer*/
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (31:7): [True: 0, False: 16.1k]
  ------------------
   32|      0|    res = AOM_CODEC_ABI_MISMATCH;
   33|  16.1k|  else if (!ctx || !iface)
  ------------------
  |  Branch (33:12): [True: 0, False: 16.1k]
  |  Branch (33:20): [True: 0, False: 16.1k]
  ------------------
   34|      0|    res = AOM_CODEC_INVALID_PARAM;
   35|  16.1k|  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
  ------------------
  |  |   65|  16.1k|#define AOM_CODEC_INTERNAL_ABI_VERSION (7) /**<\hideinitializer*/
  ------------------
  |  Branch (35:12): [True: 0, False: 16.1k]
  ------------------
   36|      0|    res = AOM_CODEC_ABI_MISMATCH;
   37|  16.1k|  else if (!(iface->caps & AOM_CODEC_CAP_DECODER))
  ------------------
  |  |  218|  16.1k|#define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
  ------------------
  |  Branch (37:12): [True: 0, False: 16.1k]
  ------------------
   38|      0|    res = AOM_CODEC_INCAPABLE;
   39|  16.1k|  else {
   40|  16.1k|    memset(ctx, 0, sizeof(*ctx));
   41|  16.1k|    ctx->iface = iface;
   42|  16.1k|    ctx->name = iface->name;
   43|  16.1k|    ctx->priv = NULL;
   44|  16.1k|    ctx->init_flags = flags;
   45|  16.1k|    ctx->config.dec = cfg;
   46|       |
   47|  16.1k|    res = ctx->iface->init(ctx);
   48|  16.1k|    if (res) {
  ------------------
  |  Branch (48:9): [True: 0, False: 16.1k]
  ------------------
   49|      0|      ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
  ------------------
  |  Branch (49:25): [True: 0, False: 0]
  ------------------
   50|      0|      aom_codec_destroy(ctx);
   51|      0|    }
   52|  16.1k|  }
   53|       |
   54|  16.1k|  return SAVE_STATUS(ctx, res);
  ------------------
  |  |   19|  16.1k|#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
  |  |  ------------------
  |  |  |  Branch (19:32): [True: 16.1k, False: 0]
  |  |  ------------------
  ------------------
   55|  16.1k|}
aom_codec_peek_stream_info:
   59|   292k|                                           aom_codec_stream_info_t *si) {
   60|   292k|  aom_codec_err_t res;
   61|       |
   62|   292k|  if (!iface || !data || !data_sz || !si) {
  ------------------
  |  Branch (62:7): [True: 0, False: 292k]
  |  Branch (62:17): [True: 0, False: 292k]
  |  Branch (62:26): [True: 0, False: 292k]
  |  Branch (62:38): [True: 0, False: 292k]
  ------------------
   63|      0|    res = AOM_CODEC_INVALID_PARAM;
   64|   292k|  } else {
   65|       |    /* Set default/unknown values */
   66|   292k|    si->w = 0;
   67|   292k|    si->h = 0;
   68|       |
   69|   292k|    res = iface->dec.peek_si(data, data_sz, si);
   70|   292k|  }
   71|       |
   72|   292k|  return res;
   73|   292k|}
aom_codec_decode:
   95|   292k|                                 size_t data_sz, void *user_priv) {
   96|   292k|  aom_codec_err_t res;
   97|       |
   98|   292k|  if (!ctx)
  ------------------
  |  Branch (98:7): [True: 0, False: 292k]
  ------------------
   99|      0|    res = AOM_CODEC_INVALID_PARAM;
  100|   292k|  else if (!ctx->iface || !ctx->priv)
  ------------------
  |  Branch (100:12): [True: 0, False: 292k]
  |  Branch (100:27): [True: 0, False: 292k]
  ------------------
  101|      0|    res = AOM_CODEC_ERROR;
  102|   292k|  else {
  103|   292k|    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv);
  104|   292k|  }
  105|       |
  106|   292k|  return SAVE_STATUS(ctx, res);
  ------------------
  |  |   19|   292k|#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
  |  |  ------------------
  |  |  |  Branch (19:32): [True: 292k, False: 0]
  |  |  ------------------
  ------------------
  107|   292k|}
aom_codec_get_frame:
  109|   354k|aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
  110|   354k|  aom_image_t *img;
  111|       |
  112|   354k|  if (!ctx || !iter || !ctx->iface || !ctx->priv)
  ------------------
  |  Branch (112:7): [True: 0, False: 354k]
  |  Branch (112:15): [True: 0, False: 354k]
  |  Branch (112:24): [True: 0, False: 354k]
  |  Branch (112:39): [True: 0, False: 354k]
  ------------------
  113|      0|    img = NULL;
  114|   354k|  else
  115|   354k|    img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
  116|       |
  117|   354k|  return img;
  118|   354k|}
aom_decoder.c:get_alg_priv:
   21|   646k|static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
   22|   646k|  return (aom_codec_alg_priv_t *)ctx->priv;
   23|   646k|}

aom_img_alloc_with_cb:
  211|  14.6k|                                   void *cb_priv) {
  212|  14.6k|  return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL,
  213|  14.6k|                          alloc_cb, cb_priv);
  214|  14.6k|}
aom_img_set_rect:
  235|  14.6k|                     unsigned int w, unsigned int h, unsigned int border) {
  236|  14.6k|  if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h &&
  ------------------
  |  Branch (236:7): [True: 14.6k, False: 0]
  |  Branch (236:28): [True: 14.6k, False: 0]
  |  Branch (236:47): [True: 14.6k, False: 0]
  ------------------
  237|  14.6k|      y + h <= img->h) {
  ------------------
  |  Branch (237:7): [True: 14.6k, False: 0]
  ------------------
  238|  14.6k|    img->d_w = w;
  239|  14.6k|    img->d_h = h;
  240|       |
  241|  14.6k|    x += border;
  242|  14.6k|    y += border;
  243|       |
  244|       |    /* Calculate plane pointers */
  245|  14.6k|    if (!(img->fmt & AOM_IMG_FMT_PLANAR)) {
  ------------------
  |  |   35|  14.6k|#define AOM_IMG_FMT_PLANAR 0x100  /**< Image is a planar format. */
  ------------------
  |  Branch (245:9): [True: 0, False: 14.6k]
  ------------------
  246|      0|      img->planes[AOM_PLANE_PACKED] =
  ------------------
  |  |  225|      0|#define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */
  ------------------
  247|      0|          img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED];
  ------------------
  |  |  225|      0|#define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */
  ------------------
  248|  14.6k|    } else {
  249|  14.6k|      const int bytes_per_sample =
  250|  14.6k|          (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
  ------------------
  |  |   38|  14.6k|#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
  ------------------
  |  Branch (250:11): [True: 9.68k, False: 4.94k]
  ------------------
  251|  14.6k|      unsigned char *data = img->img_data;
  252|       |
  253|  14.6k|      img->planes[AOM_PLANE_Y] =
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  254|  14.6k|          data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  255|  14.6k|      data += ((size_t)img->h + 2 * border) * img->stride[AOM_PLANE_Y];
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  256|       |
  257|  14.6k|      unsigned int uv_border_h = border >> img->y_chroma_shift;
  258|  14.6k|      unsigned int uv_x = x >> img->x_chroma_shift;
  259|  14.6k|      unsigned int uv_y = y >> img->y_chroma_shift;
  260|  14.6k|      if (img->fmt == AOM_IMG_FMT_NV12) {
  ------------------
  |  Branch (260:11): [True: 0, False: 14.6k]
  ------------------
  261|      0|        img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample * 2 +
  ------------------
  |  |  227|      0|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  262|      0|                                   uv_y * img->stride[AOM_PLANE_U];
  ------------------
  |  |  227|      0|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  263|      0|        img->planes[AOM_PLANE_V] = NULL;
  ------------------
  |  |  228|      0|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  264|  14.6k|      } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) {
  ------------------
  |  |   36|  14.6k|#define AOM_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */
  ------------------
  |  Branch (264:18): [True: 14.6k, False: 0]
  ------------------
  265|  14.6k|        img->planes[AOM_PLANE_U] =
  ------------------
  |  |  227|  14.6k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  266|  14.6k|            data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U];
  ------------------
  |  |  227|  14.6k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  267|  14.6k|        data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) *
  268|  14.6k|                img->stride[AOM_PLANE_U];
  ------------------
  |  |  227|  14.6k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  269|  14.6k|        img->planes[AOM_PLANE_V] =
  ------------------
  |  |  228|  14.6k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  270|  14.6k|            data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V];
  ------------------
  |  |  228|  14.6k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  271|  14.6k|      } else {
  272|      0|        img->planes[AOM_PLANE_V] =
  ------------------
  |  |  228|      0|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  273|      0|            data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V];
  ------------------
  |  |  228|      0|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  274|      0|        data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) *
  275|      0|                img->stride[AOM_PLANE_V];
  ------------------
  |  |  228|      0|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  276|      0|        img->planes[AOM_PLANE_U] =
  ------------------
  |  |  227|      0|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  277|      0|            data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U];
  ------------------
  |  |  227|      0|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  278|      0|      }
  279|  14.6k|    }
  280|  14.6k|    return 0;
  281|  14.6k|  }
  282|      0|  return -1;
  283|  14.6k|}
aom_img_free:
  304|  16.1k|void aom_img_free(aom_image_t *img) {
  305|  16.1k|  if (img) {
  ------------------
  |  Branch (305:7): [True: 16.1k, False: 0]
  ------------------
  306|  16.1k|    aom_img_remove_metadata(img);
  307|  16.1k|    if (img->img_data && img->img_data_owner) aom_free(img->img_data);
  ------------------
  |  Branch (307:9): [True: 1.97k, False: 14.2k]
  |  Branch (307:26): [True: 0, False: 1.97k]
  ------------------
  308|       |
  309|  16.1k|    if (img->self_allocd) free(img);
  ------------------
  |  Branch (309:9): [True: 0, False: 16.1k]
  ------------------
  310|  16.1k|  }
  311|  16.1k|}
aom_img_metadata_alloc:
  329|  1.07k|    aom_metadata_insert_flags_t insert_flag) {
  330|  1.07k|  if (!data || sz == 0) return NULL;
  ------------------
  |  Branch (330:7): [True: 0, False: 1.07k]
  |  Branch (330:16): [True: 0, False: 1.07k]
  ------------------
  331|  1.07k|  aom_metadata_t *metadata = (aom_metadata_t *)malloc(sizeof(aom_metadata_t));
  332|  1.07k|  if (!metadata) return NULL;
  ------------------
  |  Branch (332:7): [True: 0, False: 1.07k]
  ------------------
  333|  1.07k|  metadata->type = type;
  334|  1.07k|  metadata->payload = (uint8_t *)malloc(sz);
  335|  1.07k|  if (!metadata->payload) {
  ------------------
  |  Branch (335:7): [True: 0, False: 1.07k]
  ------------------
  336|      0|    free(metadata);
  337|      0|    return NULL;
  338|      0|  }
  339|  1.07k|  memcpy(metadata->payload, data, sz);
  340|  1.07k|  metadata->sz = sz;
  341|  1.07k|  metadata->insert_flag = insert_flag;
  342|  1.07k|  return metadata;
  343|  1.07k|}
aom_img_metadata_free:
  345|  1.07k|void aom_img_metadata_free(aom_metadata_t *metadata) {
  346|  1.07k|  if (metadata) {
  ------------------
  |  Branch (346:7): [True: 1.07k, False: 0]
  ------------------
  347|  1.07k|    if (metadata->payload) free(metadata->payload);
  ------------------
  |  Branch (347:9): [True: 1.07k, False: 0]
  ------------------
  348|  1.07k|    free(metadata);
  349|  1.07k|  }
  350|  1.07k|}
aom_img_metadata_array_alloc:
  352|    118|aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz) {
  353|    118|  aom_metadata_array_t *arr =
  354|    118|      (aom_metadata_array_t *)calloc(1, sizeof(aom_metadata_array_t));
  355|    118|  if (!arr) return NULL;
  ------------------
  |  Branch (355:7): [True: 0, False: 118]
  ------------------
  356|    118|  if (sz > 0) {
  ------------------
  |  Branch (356:7): [True: 0, False: 118]
  ------------------
  357|      0|    arr->metadata_array =
  358|      0|        (aom_metadata_t **)calloc(sz, sizeof(aom_metadata_t *));
  359|      0|    if (!arr->metadata_array) {
  ------------------
  |  Branch (359:9): [True: 0, False: 0]
  ------------------
  360|      0|      aom_img_metadata_array_free(arr);
  361|      0|      return NULL;
  362|      0|    }
  363|      0|    arr->sz = sz;
  364|      0|  }
  365|    118|  return arr;
  366|    118|}
aom_img_metadata_array_free:
  368|  16.1k|void aom_img_metadata_array_free(aom_metadata_array_t *arr) {
  369|  16.1k|  if (arr) {
  ------------------
  |  Branch (369:7): [True: 118, False: 16.0k]
  ------------------
  370|    118|    if (arr->metadata_array) {
  ------------------
  |  Branch (370:9): [True: 118, False: 0]
  ------------------
  371|  1.19k|      for (size_t i = 0; i < arr->sz; i++) {
  ------------------
  |  Branch (371:26): [True: 1.07k, False: 118]
  ------------------
  372|  1.07k|        aom_img_metadata_free(arr->metadata_array[i]);
  373|  1.07k|      }
  374|    118|      free(arr->metadata_array);
  375|    118|    }
  376|    118|    free(arr);
  377|    118|  }
  378|  16.1k|}
aom_img_remove_metadata:
  412|  92.3k|void aom_img_remove_metadata(aom_image_t *img) {
  413|  92.3k|  if (img && img->metadata) {
  ------------------
  |  Branch (413:7): [True: 92.3k, False: 0]
  |  Branch (413:14): [True: 44, False: 92.2k]
  ------------------
  414|     44|    aom_img_metadata_array_free(img->metadata);
  415|     44|    img->metadata = NULL;
  416|     44|  }
  417|  92.3k|}
aom_image.c:img_alloc_helper:
   37|  14.6k|    aom_alloc_img_data_cb_fn_t alloc_cb, void *cb_priv) {
   38|       |  /* NOTE: In this function, bit_depth is either 8 or 16 (if
   39|       |   * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12.
   40|       |   */
   41|  14.6k|  unsigned int xcs, ycs, bps, bit_depth;
   42|       |
   43|  14.6k|  if (img != NULL) memset(img, 0, sizeof(aom_image_t));
  ------------------
  |  Branch (43:7): [True: 14.6k, False: 0]
  ------------------
   44|       |
   45|  14.6k|  if (fmt == AOM_IMG_FMT_NONE) goto fail;
  ------------------
  |  Branch (45:7): [True: 0, False: 14.6k]
  ------------------
   46|       |
   47|       |  /* Impose maximum values on input parameters so that this function can
   48|       |   * perform arithmetic operations without worrying about overflows.
   49|       |   */
   50|  14.6k|  if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 ||
  ------------------
  |  Branch (50:7): [True: 0, False: 14.6k]
  |  Branch (50:27): [True: 0, False: 14.6k]
  |  Branch (50:47): [True: 0, False: 14.6k]
  ------------------
   51|  14.6k|      stride_align > 65536 || size_align > 65536 || border > 65536) {
  ------------------
  |  Branch (51:7): [True: 0, False: 14.6k]
  |  Branch (51:31): [True: 0, False: 14.6k]
  |  Branch (51:53): [True: 0, False: 14.6k]
  ------------------
   52|      0|    goto fail;
   53|      0|  }
   54|       |
   55|       |  /* Treat align==0 like align==1 */
   56|  14.6k|  if (!buf_align) buf_align = 1;
  ------------------
  |  Branch (56:7): [True: 0, False: 14.6k]
  ------------------
   57|       |
   58|       |  /* Validate alignment (must be power of 2) */
   59|  14.6k|  if (buf_align & (buf_align - 1)) goto fail;
  ------------------
  |  Branch (59:7): [True: 0, False: 14.6k]
  ------------------
   60|       |
   61|       |  /* Treat align==0 like align==1 */
   62|  14.6k|  if (!stride_align) stride_align = 1;
  ------------------
  |  Branch (62:7): [True: 0, False: 14.6k]
  ------------------
   63|       |
   64|       |  /* Validate alignment (must be power of 2) */
   65|  14.6k|  if (stride_align & (stride_align - 1)) goto fail;
  ------------------
  |  Branch (65:7): [True: 0, False: 14.6k]
  ------------------
   66|       |
   67|       |  /* Treat align==0 like align==1 */
   68|  14.6k|  if (!size_align) size_align = 1;
  ------------------
  |  Branch (68:7): [True: 0, False: 14.6k]
  ------------------
   69|       |
   70|       |  /* Validate alignment (must be power of 2) */
   71|  14.6k|  if (size_align & (size_align - 1)) goto fail;
  ------------------
  |  Branch (71:7): [True: 0, False: 14.6k]
  ------------------
   72|       |
   73|       |  /* Get sample size for this format */
   74|  14.6k|  switch (fmt) {
   75|  1.71k|    case AOM_IMG_FMT_I420:
  ------------------
  |  Branch (75:5): [True: 1.71k, False: 12.9k]
  ------------------
   76|  1.71k|    case AOM_IMG_FMT_YV12:
  ------------------
  |  Branch (76:5): [True: 0, False: 14.6k]
  ------------------
   77|  1.71k|    case AOM_IMG_FMT_NV12:
  ------------------
  |  Branch (77:5): [True: 0, False: 14.6k]
  ------------------
   78|  1.71k|    case AOM_IMG_FMT_AOMI420:
  ------------------
  |  Branch (78:5): [True: 0, False: 14.6k]
  ------------------
   79|  1.71k|    case AOM_IMG_FMT_AOMYV12: bps = 12; break;
  ------------------
  |  Branch (79:5): [True: 0, False: 14.6k]
  ------------------
   80|  2.18k|    case AOM_IMG_FMT_I422: bps = 16; break;
  ------------------
  |  Branch (80:5): [True: 2.18k, False: 12.4k]
  ------------------
   81|  1.04k|    case AOM_IMG_FMT_I444: bps = 24; break;
  ------------------
  |  Branch (81:5): [True: 1.04k, False: 13.5k]
  ------------------
   82|      0|    case AOM_IMG_FMT_YV1216:
  ------------------
  |  Branch (82:5): [True: 0, False: 14.6k]
  ------------------
   83|  5.41k|    case AOM_IMG_FMT_I42016: bps = 24; break;
  ------------------
  |  Branch (83:5): [True: 5.41k, False: 9.21k]
  ------------------
   84|    140|    case AOM_IMG_FMT_I42216: bps = 32; break;
  ------------------
  |  Branch (84:5): [True: 140, False: 14.4k]
  ------------------
   85|  4.12k|    case AOM_IMG_FMT_I44416: bps = 48; break;
  ------------------
  |  Branch (85:5): [True: 4.12k, False: 10.5k]
  ------------------
   86|      0|    default: bps = 16; break;
  ------------------
  |  Branch (86:5): [True: 0, False: 14.6k]
  ------------------
   87|  14.6k|  }
   88|       |
   89|  14.6k|  bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
  ------------------
  |  |   38|  14.6k|#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
  ------------------
  |  Branch (89:15): [True: 9.68k, False: 4.94k]
  ------------------
   90|       |
   91|       |  /* Get chroma shift values for this format */
   92|  14.6k|  switch (fmt) {
   93|  1.71k|    case AOM_IMG_FMT_I420:
  ------------------
  |  Branch (93:5): [True: 1.71k, False: 12.9k]
  ------------------
   94|  1.71k|    case AOM_IMG_FMT_YV12:
  ------------------
  |  Branch (94:5): [True: 0, False: 14.6k]
  ------------------
   95|  1.71k|    case AOM_IMG_FMT_NV12:
  ------------------
  |  Branch (95:5): [True: 0, False: 14.6k]
  ------------------
   96|  1.71k|    case AOM_IMG_FMT_AOMI420:
  ------------------
  |  Branch (96:5): [True: 0, False: 14.6k]
  ------------------
   97|  1.71k|    case AOM_IMG_FMT_AOMYV12:
  ------------------
  |  Branch (97:5): [True: 0, False: 14.6k]
  ------------------
   98|  3.90k|    case AOM_IMG_FMT_I422:
  ------------------
  |  Branch (98:5): [True: 2.18k, False: 12.4k]
  ------------------
   99|  9.32k|    case AOM_IMG_FMT_I42016:
  ------------------
  |  Branch (99:5): [True: 5.41k, False: 9.21k]
  ------------------
  100|  9.32k|    case AOM_IMG_FMT_YV1216:
  ------------------
  |  Branch (100:5): [True: 0, False: 14.6k]
  ------------------
  101|  9.46k|    case AOM_IMG_FMT_I42216: xcs = 1; break;
  ------------------
  |  Branch (101:5): [True: 140, False: 14.4k]
  ------------------
  102|  5.16k|    default: xcs = 0; break;
  ------------------
  |  Branch (102:5): [True: 5.16k, False: 9.46k]
  ------------------
  103|  14.6k|  }
  104|       |
  105|  14.6k|  switch (fmt) {
  106|  1.71k|    case AOM_IMG_FMT_I420:
  ------------------
  |  Branch (106:5): [True: 1.71k, False: 12.9k]
  ------------------
  107|  1.71k|    case AOM_IMG_FMT_YV12:
  ------------------
  |  Branch (107:5): [True: 0, False: 14.6k]
  ------------------
  108|  1.71k|    case AOM_IMG_FMT_NV12:
  ------------------
  |  Branch (108:5): [True: 0, False: 14.6k]
  ------------------
  109|  1.71k|    case AOM_IMG_FMT_AOMI420:
  ------------------
  |  Branch (109:5): [True: 0, False: 14.6k]
  ------------------
  110|  1.71k|    case AOM_IMG_FMT_AOMYV12:
  ------------------
  |  Branch (110:5): [True: 0, False: 14.6k]
  ------------------
  111|  1.71k|    case AOM_IMG_FMT_YV1216:
  ------------------
  |  Branch (111:5): [True: 0, False: 14.6k]
  ------------------
  112|  7.13k|    case AOM_IMG_FMT_I42016: ycs = 1; break;
  ------------------
  |  Branch (112:5): [True: 5.41k, False: 9.21k]
  ------------------
  113|  7.49k|    default: ycs = 0; break;
  ------------------
  |  Branch (113:5): [True: 7.49k, False: 7.13k]
  ------------------
  114|  14.6k|  }
  115|       |
  116|       |  /* Calculate storage sizes given the chroma subsampling */
  117|  14.6k|  const unsigned int w = align_image_dimension(d_w, xcs, size_align);
  118|  14.6k|  assert(d_w <= w);
  119|  14.6k|  const unsigned int h = align_image_dimension(d_h, ycs, size_align);
  120|  14.6k|  assert(d_h <= h);
  121|       |
  122|  14.6k|  uint64_t s = (uint64_t)w + 2 * border;
  123|  14.6k|  s = (fmt & AOM_IMG_FMT_PLANAR) ? s : s * bps / bit_depth;
  ------------------
  |  |   35|  14.6k|#define AOM_IMG_FMT_PLANAR 0x100  /**< Image is a planar format. */
  ------------------
  |  Branch (123:7): [True: 14.6k, False: 0]
  ------------------
  124|  14.6k|  s = s * bit_depth / 8;
  125|  14.6k|  s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1);
  126|  14.6k|  if (s > INT_MAX) goto fail;
  ------------------
  |  Branch (126:7): [True: 0, False: 14.6k]
  ------------------
  127|  14.6k|  const int stride_in_bytes = (int)s;
  128|       |
  129|       |  /* Allocate the new image */
  130|  14.6k|  if (!img) {
  ------------------
  |  Branch (130:7): [True: 0, False: 14.6k]
  ------------------
  131|      0|    img = (aom_image_t *)calloc(1, sizeof(aom_image_t));
  132|       |
  133|      0|    if (!img) goto fail;
  ------------------
  |  Branch (133:9): [True: 0, False: 0]
  ------------------
  134|       |
  135|      0|    img->self_allocd = 1;
  136|      0|  }
  137|       |
  138|  14.6k|  img->img_data = img_data;
  139|       |
  140|  14.6k|  if (!img_data) {
  ------------------
  |  Branch (140:7): [True: 14.6k, False: 0]
  ------------------
  141|  14.6k|    const uint64_t alloc_size =
  142|  14.6k|        (fmt & AOM_IMG_FMT_PLANAR)
  ------------------
  |  |   35|  14.6k|#define AOM_IMG_FMT_PLANAR 0x100  /**< Image is a planar format. */
  ------------------
  |  Branch (142:9): [True: 14.6k, False: 0]
  ------------------
  143|  14.6k|            ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / bit_depth
  144|  14.6k|            : (uint64_t)(h + 2 * border) * stride_in_bytes;
  145|       |
  146|  14.6k|    if (alloc_size != (size_t)alloc_size) goto fail;
  ------------------
  |  Branch (146:9): [True: 0, False: 14.6k]
  ------------------
  147|       |
  148|  14.6k|    if (alloc_cb) {
  ------------------
  |  Branch (148:9): [True: 14.6k, False: 0]
  ------------------
  149|  14.6k|      const size_t padded_alloc_size = (size_t)alloc_size + buf_align - 1;
  150|  14.6k|      img->img_data = (uint8_t *)alloc_cb(cb_priv, padded_alloc_size);
  151|  14.6k|      if (img->img_data) {
  ------------------
  |  Branch (151:11): [True: 14.6k, False: 0]
  ------------------
  152|  14.6k|        img->img_data = (uint8_t *)aom_align_addr(img->img_data, buf_align);
  ------------------
  |  |   49|  14.6k|  (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
  ------------------
  153|  14.6k|      }
  154|  14.6k|      img->img_data_owner = 0;
  155|  14.6k|    } else {
  156|      0|      img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
  157|      0|      img->img_data_owner = 1;
  158|      0|    }
  159|  14.6k|    img->sz = (size_t)alloc_size;
  160|  14.6k|  }
  161|       |
  162|  14.6k|  if (!img->img_data) goto fail;
  ------------------
  |  Branch (162:7): [True: 0, False: 14.6k]
  ------------------
  163|       |
  164|  14.6k|  img->fmt = fmt;
  165|  14.6k|  img->bit_depth = bit_depth;
  166|       |  // aligned width and aligned height
  167|  14.6k|  img->w = w;
  168|  14.6k|  img->h = h;
  169|  14.6k|  img->x_chroma_shift = xcs;
  170|  14.6k|  img->y_chroma_shift = ycs;
  171|  14.6k|  img->bps = bps;
  172|       |
  173|       |  /* Calculate strides */
  174|  14.6k|  img->stride[AOM_PLANE_Y] = stride_in_bytes;
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  175|  14.6k|  img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
  ------------------
  |  |  227|  14.6k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
                img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
  ------------------
  |  |  228|  14.6k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  176|       |
  177|  14.6k|  if (fmt == AOM_IMG_FMT_NV12) {
  ------------------
  |  Branch (177:7): [True: 0, False: 14.6k]
  ------------------
  178|       |    // Each row is a row of U and a row of V interleaved, so the stride is twice
  179|       |    // as long.
  180|      0|    img->stride[AOM_PLANE_U] *= 2;
  ------------------
  |  |  227|      0|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  181|      0|    img->stride[AOM_PLANE_V] = 0;
  ------------------
  |  |  228|      0|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  182|      0|  }
  183|       |
  184|  14.6k|  img->cp = AOM_CICP_CP_UNSPECIFIED;
  185|  14.6k|  img->tc = AOM_CICP_TC_UNSPECIFIED;
  186|  14.6k|  img->mc = AOM_CICP_MC_UNSPECIFIED;
  187|       |
  188|       |  /* Default viewport to entire image. (This aom_img_set_rect call always
  189|       |   * succeeds.) */
  190|  14.6k|  int ret = aom_img_set_rect(img, 0, 0, d_w, d_h, border);
  191|  14.6k|  assert(ret == 0);
  192|  14.6k|  (void)ret;
  193|  14.6k|  return img;
  194|       |
  195|      0|fail:
  196|      0|  aom_img_free(img);
  197|      0|  return NULL;
  198|  14.6k|}
aom_image.c:align_image_dimension:
   25|  29.2k|                                                 unsigned int size_align) {
   26|  29.2k|  unsigned int align;
   27|       |
   28|  29.2k|  align = (1 << subsampling) - 1;
   29|  29.2k|  align = (size_align - 1 > align) ? (size_align - 1) : align;
  ------------------
  |  Branch (29:11): [True: 0, False: 29.2k]
  ------------------
   30|  29.2k|  return ((d + align) & ~align);
   31|  29.2k|}

aom_uleb_decode:
   32|  1.31M|                    size_t *length) {
   33|  1.31M|  if (buffer && value) {
  ------------------
  |  Branch (33:7): [True: 1.31M, False: 0]
  |  Branch (33:17): [True: 1.31M, False: 0]
  ------------------
   34|  1.31M|    *value = 0;
   35|  1.36M|    for (size_t i = 0; i < kMaximumLeb128Size && i < available; ++i) {
  ------------------
  |  Branch (35:24): [True: 1.36M, False: 195]
  |  Branch (35:50): [True: 1.36M, False: 2.09k]
  ------------------
   36|  1.36M|      const uint8_t decoded_byte = *(buffer + i) & kLeb128ByteMask;
   37|  1.36M|      *value |= ((uint64_t)decoded_byte) << (i * 7);
   38|  1.36M|      if ((*(buffer + i) >> 7) == 0) {
  ------------------
  |  Branch (38:11): [True: 1.31M, False: 45.5k]
  ------------------
   39|  1.31M|        if (length) {
  ------------------
  |  Branch (39:13): [True: 1.31M, False: 0]
  ------------------
   40|  1.31M|          *length = i + 1;
   41|  1.31M|        }
   42|       |
   43|       |        // Fail on values larger than 32-bits to ensure consistent behavior on
   44|       |        // 32 and 64 bit targets: value is typically used to determine buffer
   45|       |        // allocation size.
   46|  1.31M|        if (*value > UINT32_MAX) return -1;
  ------------------
  |  Branch (46:13): [True: 1.68k, False: 1.31M]
  ------------------
   47|       |
   48|  1.31M|        return 0;
   49|  1.31M|      }
   50|  1.36M|    }
   51|  1.31M|  }
   52|       |
   53|       |  // If we get here, either the buffer/value pointers were invalid,
   54|       |  // or we ran over the available space
   55|  2.28k|  return -1;
   56|  1.31M|}

decodeframe.c:clamp:
   74|  42.6M|static inline int clamp(int value, int low, int high) {
   75|  42.6M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 1.92M, False: 40.7M]
  |  Branch (75:31): [True: 1.01M, False: 39.7M]
  ------------------
   76|  42.6M|}
decodemv.c:clamp:
   74|  3.86M|static inline int clamp(int value, int low, int high) {
   75|  3.86M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 8.34k, False: 3.86M]
  |  Branch (75:31): [True: 127k, False: 3.73M]
  ------------------
   76|  3.86M|}
decodetxb.c:clamp:
   74|   103M|static inline int clamp(int value, int low, int high) {
   75|   103M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 4.74k, False: 103M]
  |  Branch (75:31): [True: 7.80k, False: 103M]
  ------------------
   76|   103M|}
grain_synthesis.c:clamp:
   74|   177M|static inline int clamp(int value, int low, int high) {
   75|   177M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 32.2M, False: 145M]
  |  Branch (75:31): [True: 23.2M, False: 121M]
  ------------------
   76|   177M|}
blend_a64_mask.c:negative_to_zero:
   99|  54.9k|static inline unsigned int negative_to_zero(int value) {
  100|  54.9k|  return value & ~(value >> (sizeof(value) * 8 - 1));
  101|  54.9k|}
av1_inv_txfm2d.c:clamp:
   74|  10.0M|static inline int clamp(int value, int low, int high) {
   75|  10.0M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 143k, False: 9.93M]
  |  Branch (75:31): [True: 160k, False: 9.77M]
  ------------------
   76|  10.0M|}
av1_inv_txfm2d.c:clip_pixel_highbd:
   86|  10.0M|static inline uint16_t clip_pixel_highbd(int val, int bd) {
   87|  10.0M|  switch (bd) {
   88|  9.07M|    case 8:
  ------------------
  |  Branch (88:5): [True: 9.07M, False: 1.01M]
  ------------------
   89|  9.07M|    default: return (uint16_t)clamp(val, 0, 255);
  ------------------
  |  Branch (89:5): [True: 0, False: 10.0M]
  ------------------
   90|   698k|    case 10: return (uint16_t)clamp(val, 0, 1023);
  ------------------
  |  Branch (90:5): [True: 698k, False: 9.38M]
  ------------------
   91|   313k|    case 12: return (uint16_t)clamp(val, 0, 4095);
  ------------------
  |  Branch (91:5): [True: 313k, False: 9.76M]
  ------------------
   92|  10.0M|  }
   93|  10.0M|}
av1_loopfilter.c:clamp:
   74|   105M|static inline int clamp(int value, int low, int high) {
   75|  18.4E|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 60.2M, False: 45.4M]
  |  Branch (75:31): [True: 83.9M, False: 18.4E]
  ------------------
   76|   105M|}
convolve.c:clip_pixel:
   70|   890k|static inline uint8_t clip_pixel(int val) {
   71|   890k|  return (val > 255) ? 255 : (val < 0) ? 0 : val;
  ------------------
  |  Branch (71:10): [True: 0, False: 890k]
  |  Branch (71:30): [True: 0, False: 890k]
  ------------------
   72|   890k|}
convolve.c:clip_pixel_highbd:
   86|   679k|static inline uint16_t clip_pixel_highbd(int val, int bd) {
   87|   679k|  switch (bd) {
   88|      0|    case 8:
  ------------------
  |  Branch (88:5): [True: 0, False: 679k]
  ------------------
   89|      0|    default: return (uint16_t)clamp(val, 0, 255);
  ------------------
  |  Branch (89:5): [True: 0, False: 679k]
  ------------------
   90|   250k|    case 10: return (uint16_t)clamp(val, 0, 1023);
  ------------------
  |  Branch (90:5): [True: 250k, False: 429k]
  ------------------
   91|   429k|    case 12: return (uint16_t)clamp(val, 0, 4095);
  ------------------
  |  Branch (91:5): [True: 429k, False: 250k]
  ------------------
   92|   679k|  }
   93|   679k|}
convolve.c:clamp:
   74|   679k|static inline int clamp(int value, int low, int high) {
   75|   679k|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 0, False: 679k]
  |  Branch (75:31): [True: 0, False: 679k]
  ------------------
   76|   679k|}
mvref_common.c:clamp:
   74|  72.4M|static inline int clamp(int value, int low, int high) {
   75|  72.4M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 672k, False: 71.7M]
  |  Branch (75:31): [True: 1.04M, False: 70.7M]
  ------------------
   76|  72.4M|}
quant_common.c:clamp:
   74|   100M|static inline int clamp(int value, int low, int high) {
   75|   100M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 4.06M, False: 96.9M]
  |  Branch (75:31): [True: 1.20M, False: 95.7M]
  ------------------
   76|   100M|}
reconinter.c:clamp:
   74|  1.25M|static inline int clamp(int value, int low, int high) {
   75|  1.25M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 0, False: 1.25M]
  |  Branch (75:31): [True: 224k, False: 1.03M]
  ------------------
   76|  1.25M|}
reconintra.c:clip_pixel_highbd:
   86|  96.9M|static inline uint16_t clip_pixel_highbd(int val, int bd) {
   87|  96.9M|  switch (bd) {
   88|      0|    case 8:
  ------------------
  |  Branch (88:5): [True: 0, False: 96.9M]
  ------------------
   89|      0|    default: return (uint16_t)clamp(val, 0, 255);
  ------------------
  |  Branch (89:5): [True: 0, False: 96.9M]
  ------------------
   90|  89.9M|    case 10: return (uint16_t)clamp(val, 0, 1023);
  ------------------
  |  Branch (90:5): [True: 89.9M, False: 7.07M]
  ------------------
   91|  7.29M|    case 12: return (uint16_t)clamp(val, 0, 4095);
  ------------------
  |  Branch (91:5): [True: 7.29M, False: 89.7M]
  ------------------
   92|  96.9M|  }
   93|  96.9M|}
reconintra.c:clamp:
   74|  97.2M|static inline int clamp(int value, int low, int high) {
   75|  97.2M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 70.5k, False: 97.1M]
  |  Branch (75:31): [True: 112k, False: 97.0M]
  ------------------
   76|  97.2M|}
warped_motion.c:clamp:
   74|  2.91M|static inline int clamp(int value, int low, int high) {
   75|  2.91M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 52.6k, False: 2.86M]
  |  Branch (75:31): [True: 70.8k, False: 2.79M]
  ------------------
   76|  2.91M|}
warped_motion.c:clamp64:
   78|  1.10M|static inline int64_t clamp64(int64_t value, int64_t low, int64_t high) {
   79|  1.10M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (79:10): [True: 39.4k, False: 1.06M]
  |  Branch (79:31): [True: 33.3k, False: 1.03M]
  ------------------
   80|  1.10M|}
av1_convolve_scale_sse4.c:clip_pixel:
   70|  1.18M|static inline uint8_t clip_pixel(int val) {
   71|  1.18M|  return (val > 255) ? 255 : (val < 0) ? 0 : val;
  ------------------
  |  Branch (71:10): [True: 4.37k, False: 1.17M]
  |  Branch (71:30): [True: 3.56k, False: 1.17M]
  ------------------
   72|  1.18M|}
av1_convolve_scale_sse4.c:clip_pixel_highbd:
   86|   364k|static inline uint16_t clip_pixel_highbd(int val, int bd) {
   87|   364k|  switch (bd) {
   88|      0|    case 8:
  ------------------
  |  Branch (88:5): [True: 0, False: 364k]
  ------------------
   89|      0|    default: return (uint16_t)clamp(val, 0, 255);
  ------------------
  |  Branch (89:5): [True: 0, False: 364k]
  ------------------
   90|   270k|    case 10: return (uint16_t)clamp(val, 0, 1023);
  ------------------
  |  Branch (90:5): [True: 270k, False: 93.4k]
  ------------------
   91|  93.4k|    case 12: return (uint16_t)clamp(val, 0, 4095);
  ------------------
  |  Branch (91:5): [True: 93.4k, False: 270k]
  ------------------
   92|   364k|  }
   93|   364k|}
av1_convolve_scale_sse4.c:clamp:
   74|   364k|static inline int clamp(int value, int low, int high) {
   75|   364k|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 3.55k, False: 360k]
  |  Branch (75:31): [True: 3.61k, False: 357k]
  ------------------
   76|   364k|}
warp_plane_avx2.c:clamp:
   74|  30.0M|static inline int clamp(int value, int low, int high) {
   75|  30.0M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 1.81M, False: 28.2M]
  |  Branch (75:31): [True: 1.72M, False: 26.5M]
  ------------------
   76|  30.0M|}
highbd_warp_affine_avx2.c:clamp:
   74|  89.8M|static inline int clamp(int value, int low, int high) {
   75|  89.8M|  return value < low ? low : (value > high ? high : value);
  ------------------
  |  Branch (75:10): [True: 13.4M, False: 76.3M]
  |  Branch (75:31): [True: 23.2M, False: 53.0M]
  ------------------
   76|  89.8M|}

aom_dsp_rtcd:
   18|  16.1k|void aom_dsp_rtcd(void) { aom_once(setup_rtcd_internal); }

aom_read_primitive_refsubexpfin_:
   57|   384k|                                          uint16_t ref ACCT_STR_PARAM) {
   58|   384k|  return inv_recenter_finite_nonneg(
   59|   384k|      n, ref, read_primitive_subexpfin(r, n, k, ACCT_STR_NAME));
  ------------------
  |  |   18|   384k|  read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   60|   384k|}
binary_codes_reader.c:read_primitive_subexpfin_:
   32|   384k|                                          uint16_t k ACCT_STR_PARAM) {
   33|   384k|  int i = 0;
   34|   384k|  int mk = 0;
   35|       |
   36|   686k|  while (1) {
  ------------------
  |  Branch (36:10): [Folded - Ignored]
  ------------------
   37|   686k|    int b = (i ? k + i - 1 : k);
  ------------------
  |  Branch (37:14): [True: 303k, False: 383k]
  ------------------
   38|   686k|    int a = (1 << b);
   39|       |
   40|   686k|    if (n <= mk + 3 * a) {
  ------------------
  |  Branch (40:9): [True: 112k, False: 574k]
  ------------------
   41|   112k|      return read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
  ------------------
  |  |   16|   112k|  read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   42|   112k|    }
   43|       |
   44|   574k|    if (!aom_read_bit(r, ACCT_STR_NAME)) {
  ------------------
  |  |   43|   574k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (44:9): [True: 272k, False: 301k]
  ------------------
   45|   272k|      return aom_read_literal(r, b, ACCT_STR_NAME) + mk;
  ------------------
  |  |   47|   272k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   46|   272k|    }
   47|       |
   48|   301k|    i = i + 1;
   49|   301k|    mk += a;
   50|   301k|  }
   51|       |
   52|  18.4E|  assert(0);
   53|      0|  return 0;
   54|  18.4E|}
binary_codes_reader.c:read_primitive_quniform_:
   21|   112k|                                         uint16_t n ACCT_STR_PARAM) {
   22|   112k|  if (n <= 1) return 0;
  ------------------
  |  Branch (22:7): [True: 0, False: 112k]
  ------------------
   23|   112k|  const int l = get_msb(n) + 1;
   24|   112k|  const int m = (1 << l) - n;
   25|   112k|  const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
  ------------------
  |  |   47|   112k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   26|   112k|  return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
  ------------------
  |  |   43|   176k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (26:10): [True: 47.1k, False: 64.9k]
  ------------------
   27|   112k|}

aom_reader_init:
   14|   162k|int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size) {
   15|   162k|  if (size && !buffer) {
  ------------------
  |  Branch (15:7): [True: 162k, False: 16]
  |  Branch (15:15): [True: 0, False: 162k]
  ------------------
   16|      0|    return 1;
   17|      0|  }
   18|   162k|  r->buffer_end = buffer + size;
   19|   162k|  r->buffer = buffer;
   20|   162k|  od_ec_dec_init(&r->ec, buffer, (uint32_t)size);
   21|       |#if CONFIG_ACCOUNTING
   22|       |  r->accounting = NULL;
   23|       |#endif
   24|   162k|  return 0;
   25|   162k|}
aom_reader_find_begin:
   27|   103k|const uint8_t *aom_reader_find_begin(aom_reader *r) { return r->buffer; }
aom_reader_find_end:
   29|   187k|const uint8_t *aom_reader_find_end(aom_reader *r) { return r->buffer_end; }
aom_reader_tell:
   31|  1.36M|uint32_t aom_reader_tell(const aom_reader *r) { return od_ec_dec_tell(&r->ec); }
aom_reader_has_overflowed:
   37|  1.25M|int aom_reader_has_overflowed(const aom_reader *r) {
   38|  1.25M|  const uint32_t tell_bits = aom_reader_tell(r);
   39|  1.25M|  const uint32_t tell_bytes = (tell_bits + 7) >> 3;
   40|  1.25M|  return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer);
   41|  1.25M|}

decodeframe.c:aom_read_symbol_:
  221|  13.5M|                                   int nsymbs ACCT_STR_PARAM) {
  222|  13.5M|  int ret;
  223|  13.5M|  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
  ------------------
  |  |   49|  13.5M|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|  13.5M|  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
  ------------------
  |  Branch (224:7): [True: 12.9M, False: 588k]
  ------------------
  225|  13.5M|  return ret;
  226|  13.5M|}
decodeframe.c:aom_read_cdf_:
  169|  13.8M|                                int nsymbs ACCT_STR_PARAM) {
  170|  13.8M|  int symb;
  171|  13.8M|  assert(cdf != NULL);
  172|  13.8M|  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
  173|       |
  174|       |#if CONFIG_BITSTREAM_DEBUG
  175|       |  {
  176|       |    int i;
  177|       |    int cdf_error = 0;
  178|       |    int ref_symb, ref_nsymbs;
  179|       |    aom_cdf_prob ref_cdf[16];
  180|       |    const int queue_r = bitstream_queue_get_read();
  181|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  182|       |    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
  183|       |    if (nsymbs != ref_nsymbs) {
  184|       |      fprintf(stderr,
  185|       |              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
  186|       |              "queue_r %d\n",
  187|       |              frame_idx, nsymbs, ref_nsymbs, queue_r);
  188|       |      cdf_error = 0;
  189|       |      assert(0);
  190|       |    } else {
  191|       |      for (i = 0; i < nsymbs; ++i)
  192|       |        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
  193|       |    }
  194|       |    if (cdf_error) {
  195|       |      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
  196|       |              cdf[0]);
  197|       |      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
  198|       |      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
  199|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  200|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  201|       |      assert(0);
  202|       |    }
  203|       |    if (symb != ref_symb) {
  204|       |      fprintf(
  205|       |          stderr,
  206|       |          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
  207|       |          frame_idx, symb, ref_symb, queue_r);
  208|       |      assert(0);
  209|       |    }
  210|       |  }
  211|       |#endif
  212|       |
  213|       |#if CONFIG_ACCOUNTING
  214|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  215|       |  aom_update_symb_counts(r, (nsymbs == 2));
  216|       |#endif
  217|  13.8M|  return symb;
  218|  13.8M|}
decodeframe.c:aom_read_literal_:
  158|   106k|static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  159|   106k|  int literal = 0, bit;
  160|       |
  161|   530k|  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
  ------------------
  |  |   43|   423k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (161:24): [True: 423k, False: 106k]
  ------------------
  162|       |#if CONFIG_ACCOUNTING
  163|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  164|       |#endif
  165|   106k|  return literal;
  166|   106k|}
decodeframe.c:aom_read_bit_:
  149|   423k|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|   423k|  int ret;
  151|   423k|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|   423k|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|   423k|  return ret;
  156|   423k|}
decodeframe.c:aom_read_:
  104|   423k|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|   423k|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|   423k|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|   423k|  return bit;
  147|   423k|}
decodemv.c:aom_read_symbol_:
  221|   110M|                                   int nsymbs ACCT_STR_PARAM) {
  222|   110M|  int ret;
  223|   110M|  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
  ------------------
  |  |   49|   110M|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|   110M|  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
  ------------------
  |  Branch (224:7): [True: 106M, False: 3.39M]
  ------------------
  225|   110M|  return ret;
  226|   110M|}
decodemv.c:aom_read_cdf_:
  169|   110M|                                int nsymbs ACCT_STR_PARAM) {
  170|   110M|  int symb;
  171|   110M|  assert(cdf != NULL);
  172|   110M|  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
  173|       |
  174|       |#if CONFIG_BITSTREAM_DEBUG
  175|       |  {
  176|       |    int i;
  177|       |    int cdf_error = 0;
  178|       |    int ref_symb, ref_nsymbs;
  179|       |    aom_cdf_prob ref_cdf[16];
  180|       |    const int queue_r = bitstream_queue_get_read();
  181|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  182|       |    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
  183|       |    if (nsymbs != ref_nsymbs) {
  184|       |      fprintf(stderr,
  185|       |              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
  186|       |              "queue_r %d\n",
  187|       |              frame_idx, nsymbs, ref_nsymbs, queue_r);
  188|       |      cdf_error = 0;
  189|       |      assert(0);
  190|       |    } else {
  191|       |      for (i = 0; i < nsymbs; ++i)
  192|       |        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
  193|       |    }
  194|       |    if (cdf_error) {
  195|       |      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
  196|       |              cdf[0]);
  197|       |      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
  198|       |      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
  199|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  200|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  201|       |      assert(0);
  202|       |    }
  203|       |    if (symb != ref_symb) {
  204|       |      fprintf(
  205|       |          stderr,
  206|       |          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
  207|       |          frame_idx, symb, ref_symb, queue_r);
  208|       |      assert(0);
  209|       |    }
  210|       |  }
  211|       |#endif
  212|       |
  213|       |#if CONFIG_ACCOUNTING
  214|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  215|       |  aom_update_symb_counts(r, (nsymbs == 2));
  216|       |#endif
  217|   110M|  return symb;
  218|   110M|}
decodemv.c:aom_read_literal_:
  158|  2.03M|static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  159|  2.03M|  int literal = 0, bit;
  160|       |
  161|  11.0M|  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
  ------------------
  |  |   43|  9.03M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (161:24): [True: 9.03M, False: 2.03M]
  ------------------
  162|       |#if CONFIG_ACCOUNTING
  163|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  164|       |#endif
  165|  2.03M|  return literal;
  166|  2.03M|}
decodemv.c:aom_read_bit_:
  149|  9.64M|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|  9.64M|  int ret;
  151|  9.64M|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|  9.64M|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|  9.64M|  return ret;
  156|  9.64M|}
decodemv.c:aom_read_:
  104|  9.64M|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|  9.64M|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|  9.64M|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|  9.64M|  return bit;
  147|  9.64M|}
decodetxb.c:aom_read_symbol_:
  221|   374M|                                   int nsymbs ACCT_STR_PARAM) {
  222|   374M|  int ret;
  223|   374M|  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
  ------------------
  |  |   49|   374M|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|   374M|  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
  ------------------
  |  Branch (224:7): [True: 361M, False: 12.1M]
  ------------------
  225|   374M|  return ret;
  226|   374M|}
decodetxb.c:aom_read_cdf_:
  169|   372M|                                int nsymbs ACCT_STR_PARAM) {
  170|   372M|  int symb;
  171|   372M|  assert(cdf != NULL);
  172|   372M|  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
  173|       |
  174|       |#if CONFIG_BITSTREAM_DEBUG
  175|       |  {
  176|       |    int i;
  177|       |    int cdf_error = 0;
  178|       |    int ref_symb, ref_nsymbs;
  179|       |    aom_cdf_prob ref_cdf[16];
  180|       |    const int queue_r = bitstream_queue_get_read();
  181|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  182|       |    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
  183|       |    if (nsymbs != ref_nsymbs) {
  184|       |      fprintf(stderr,
  185|       |              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
  186|       |              "queue_r %d\n",
  187|       |              frame_idx, nsymbs, ref_nsymbs, queue_r);
  188|       |      cdf_error = 0;
  189|       |      assert(0);
  190|       |    } else {
  191|       |      for (i = 0; i < nsymbs; ++i)
  192|       |        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
  193|       |    }
  194|       |    if (cdf_error) {
  195|       |      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
  196|       |              cdf[0]);
  197|       |      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
  198|       |      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
  199|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  200|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  201|       |      assert(0);
  202|       |    }
  203|       |    if (symb != ref_symb) {
  204|       |      fprintf(
  205|       |          stderr,
  206|       |          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
  207|       |          frame_idx, symb, ref_symb, queue_r);
  208|       |      assert(0);
  209|       |    }
  210|       |  }
  211|       |#endif
  212|       |
  213|       |#if CONFIG_ACCOUNTING
  214|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  215|       |  aom_update_symb_counts(r, (nsymbs == 2));
  216|       |#endif
  217|   372M|  return symb;
  218|   372M|}
decodetxb.c:aom_read_bit_:
  149|   110M|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|   110M|  int ret;
  151|   110M|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|   110M|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|   110M|  return ret;
  156|   110M|}
decodetxb.c:aom_read_:
  104|   110M|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|   110M|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|   110M|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|   110M|  return bit;
  147|   110M|}
detokenize.c:aom_read_literal_:
  158|   196k|static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  159|   196k|  int literal = 0, bit;
  160|       |
  161|   513k|  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
  ------------------
  |  |   43|   317k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (161:24): [True: 317k, False: 196k]
  ------------------
  162|       |#if CONFIG_ACCOUNTING
  163|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  164|       |#endif
  165|   196k|  return literal;
  166|   196k|}
detokenize.c:aom_read_bit_:
  149|   317k|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|   317k|  int ret;
  151|   317k|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|   317k|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|   317k|  return ret;
  156|   317k|}
detokenize.c:aom_read_:
  104|   317k|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|   317k|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|   317k|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|   317k|  return bit;
  147|   317k|}
detokenize.c:aom_read_symbol_:
  221|  26.1M|                                   int nsymbs ACCT_STR_PARAM) {
  222|  26.1M|  int ret;
  223|  26.1M|  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
  ------------------
  |  |   49|  26.1M|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|  26.1M|  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
  ------------------
  |  Branch (224:7): [True: 24.4M, False: 1.73M]
  ------------------
  225|  26.1M|  return ret;
  226|  26.1M|}
detokenize.c:aom_read_cdf_:
  169|  26.1M|                                int nsymbs ACCT_STR_PARAM) {
  170|  26.1M|  int symb;
  171|  26.1M|  assert(cdf != NULL);
  172|  26.1M|  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
  173|       |
  174|       |#if CONFIG_BITSTREAM_DEBUG
  175|       |  {
  176|       |    int i;
  177|       |    int cdf_error = 0;
  178|       |    int ref_symb, ref_nsymbs;
  179|       |    aom_cdf_prob ref_cdf[16];
  180|       |    const int queue_r = bitstream_queue_get_read();
  181|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  182|       |    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
  183|       |    if (nsymbs != ref_nsymbs) {
  184|       |      fprintf(stderr,
  185|       |              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
  186|       |              "queue_r %d\n",
  187|       |              frame_idx, nsymbs, ref_nsymbs, queue_r);
  188|       |      cdf_error = 0;
  189|       |      assert(0);
  190|       |    } else {
  191|       |      for (i = 0; i < nsymbs; ++i)
  192|       |        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
  193|       |    }
  194|       |    if (cdf_error) {
  195|       |      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
  196|       |              cdf[0]);
  197|       |      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
  198|       |      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
  199|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  200|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  201|       |      assert(0);
  202|       |    }
  203|       |    if (symb != ref_symb) {
  204|       |      fprintf(
  205|       |          stderr,
  206|       |          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
  207|       |          frame_idx, symb, ref_symb, queue_r);
  208|       |      assert(0);
  209|       |    }
  210|       |  }
  211|       |#endif
  212|       |
  213|       |#if CONFIG_ACCOUNTING
  214|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  215|       |  aom_update_symb_counts(r, (nsymbs == 2));
  216|       |#endif
  217|  26.1M|  return symb;
  218|  26.1M|}
binary_codes_reader.c:aom_read_bit_:
  149|  2.00M|static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  150|  2.00M|  int ret;
  151|  2.00M|  ret = aom_read(r, 128, NULL);  // aom_prob_half
  ------------------
  |  |   41|  2.00M|  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  152|       |#if CONFIG_ACCOUNTING
  153|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  154|       |#endif
  155|  2.00M|  return ret;
  156|  2.00M|}
binary_codes_reader.c:aom_read_:
  104|  2.00M|static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  105|  2.00M|  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
  106|  2.00M|  int bit = od_ec_decode_bool_q15(&r->ec, p);
  107|       |
  108|       |#if CONFIG_BITSTREAM_DEBUG
  109|       |  {
  110|       |    int i;
  111|       |    int ref_bit, ref_nsymbs;
  112|       |    aom_cdf_prob ref_cdf[16];
  113|       |    const int queue_r = bitstream_queue_get_read();
  114|       |    const int frame_idx = aom_bitstream_queue_get_frame_read();
  115|       |    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
  116|       |    if (ref_nsymbs != 2) {
  117|       |      fprintf(stderr,
  118|       |              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
  119|       |              "%d queue_r %d\n",
  120|       |              frame_idx, 2, ref_nsymbs, queue_r);
  121|       |      assert(0);
  122|       |    }
  123|       |    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
  124|       |        (ref_cdf[1] != 32767)) {
  125|       |      fprintf(stderr,
  126|       |              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
  127|       |              frame_idx, p, 32767, ref_cdf[0]);
  128|       |      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
  129|       |      fprintf(stderr, "} queue_r %d\n", queue_r);
  130|       |      assert(0);
  131|       |    }
  132|       |    if (bit != ref_bit) {
  133|       |      fprintf(stderr,
  134|       |              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
  135|       |              "queue_r %d\n",
  136|       |              frame_idx, bit, ref_bit, queue_r);
  137|       |      assert(0);
  138|       |    }
  139|       |  }
  140|       |#endif
  141|       |
  142|       |#if CONFIG_ACCOUNTING
  143|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  144|       |  aom_update_symb_counts(r, 1);
  145|       |#endif
  146|  2.00M|  return bit;
  147|  2.00M|}
binary_codes_reader.c:aom_read_literal_:
  158|   384k|static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  159|   384k|  int literal = 0, bit;
  160|       |
  161|  1.76M|  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
  ------------------
  |  |   43|  1.37M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (161:24): [True: 1.37M, False: 384k]
  ------------------
  162|       |#if CONFIG_ACCOUNTING
  163|       |  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
  164|       |#endif
  165|   384k|  return literal;
  166|   384k|}

aom_rb_bytes_read:
   20|   332k|size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
   21|   332k|  return (rb->bit_offset + 7) >> 3;
   22|   332k|}
aom_rb_read_bit:
   24|  63.4M|int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
   25|  63.4M|  const uint32_t off = rb->bit_offset;
   26|  63.4M|  const uint32_t p = off >> 3;
   27|  63.4M|  const int q = 7 - (int)(off & 0x7);
   28|  63.4M|  if (rb->bit_buffer + p < rb->bit_buffer_end) {
  ------------------
  |  Branch (28:7): [True: 63.0M, False: 347k]
  ------------------
   29|  63.0M|    const int bit = (rb->bit_buffer[p] >> q) & 1;
   30|  63.0M|    rb->bit_offset = off + 1;
   31|  63.0M|    return bit;
   32|  63.0M|  } else {
   33|   347k|    if (rb->error_handler) rb->error_handler(rb->error_handler_data);
  ------------------
  |  Branch (33:9): [True: 14.6k, False: 332k]
  ------------------
   34|   347k|    return 0;
   35|   347k|  }
   36|  63.4M|}
aom_rb_read_literal:
   38|  8.66M|int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
   39|  8.66M|  assert(bits <= 31);
   40|  8.66M|  int value = 0, bit;
   41|  52.8M|  for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
  ------------------
  |  Branch (41:24): [True: 44.1M, False: 8.66M]
  ------------------
   42|  8.66M|  return value;
   43|  8.66M|}
aom_rb_read_unsigned_literal:
   47|   123k|                                      int bits) {
   48|   123k|  assert(bits <= 32);
   49|   123k|  uint32_t value = 0;
   50|   123k|  int bit;
   51|  2.92M|  for (bit = bits - 1; bit >= 0; bit--)
  ------------------
  |  Branch (51:24): [True: 2.79M, False: 123k]
  ------------------
   52|  2.79M|    value |= (uint32_t)aom_rb_read_bit(rb) << bit;
   53|   123k|  return value;
   54|   123k|}
aom_rb_read_inv_signed_literal:
   56|   370k|int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
   57|   370k|  const int nbits = sizeof(unsigned) * 8 - bits - 1;
   58|   370k|  const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
   59|   370k|  return ((int)value) >> nbits;
   60|   370k|}
aom_rb_read_uvlc:
   63|  15.7k|uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
   64|  15.7k|  int leading_zeros = 0;
   65|   127k|  while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros;
  ------------------
  |  Branch (65:10): [True: 127k, False: 129]
  |  Branch (65:32): [True: 111k, False: 15.5k]
  ------------------
   66|       |  // Maximum 32 bits.
   67|  15.7k|  if (leading_zeros == 32) return UINT32_MAX;
  ------------------
  |  Branch (67:7): [True: 129, False: 15.5k]
  ------------------
   68|  15.5k|  const uint32_t base = (1u << leading_zeros) - 1;
   69|  15.5k|  const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
   70|  15.5k|  return base + value;
   71|  15.7k|}
aom_rb_read_signed_primitive_refsubexpfin:
  115|   116k|    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
  116|   116k|  ref += n - 1;
  117|   116k|  const uint16_t scaled_n = (n << 1) - 1;
  118|   116k|  return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
  119|   116k|}
bitreader_buffer.c:aom_rb_read_primitive_refsubexpfin:
  109|   116k|    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
  110|   116k|  return inv_recenter_finite_nonneg(n, ref,
  111|   116k|                                    aom_rb_read_primitive_subexpfin(rb, n, k));
  112|   116k|}
bitreader_buffer.c:aom_rb_read_primitive_subexpfin:
   84|   116k|                                                uint16_t n, uint16_t k) {
   85|   116k|  int i = 0;
   86|   116k|  int mk = 0;
   87|       |
   88|   494k|  while (1) {
  ------------------
  |  Branch (88:10): [Folded - Ignored]
  ------------------
   89|   494k|    int b = (i ? k + i - 1 : k);
  ------------------
  |  Branch (89:14): [True: 377k, False: 116k]
  ------------------
   90|   494k|    int a = (1 << b);
   91|       |
   92|   494k|    if (n <= mk + 3 * a) {
  ------------------
  |  Branch (92:9): [True: 29.9k, False: 464k]
  ------------------
   93|  29.9k|      return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
   94|  29.9k|    }
   95|       |
   96|   464k|    if (!aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (96:9): [True: 86.2k, False: 378k]
  ------------------
   97|  86.2k|      return aom_rb_read_literal(rb, b) + mk;
   98|  86.2k|    }
   99|       |
  100|   378k|    i = i + 1;
  101|   378k|    mk += a;
  102|   378k|  }
  103|       |
  104|    186|  assert(0);
  105|      0|  return 0;
  106|    186|}
bitreader_buffer.c:aom_rb_read_primitive_quniform:
   75|  29.9k|                                               uint16_t n) {
   76|  29.9k|  if (n <= 1) return 0;
  ------------------
  |  Branch (76:7): [True: 0, False: 29.9k]
  ------------------
   77|  29.9k|  const int l = get_msb(n) + 1;
   78|  29.9k|  const int m = (1 << l) - n;
   79|  29.9k|  const int v = aom_rb_read_literal(rb, l - 1);
   80|  29.9k|  return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
  ------------------
  |  Branch (80:10): [True: 11.2k, False: 18.6k]
  ------------------
   81|  29.9k|}

aom_highbd_blend_a64_d16_mask_c:
  128|    122|    ConvolveParams *conv_params, const int bd) {
  129|    122|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|    122|#define FILTER_BITS 7
  ------------------
  130|    122|  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
  131|    122|                           (1 << (offset_bits - conv_params->round_1 - 1));
  132|    122|  const int round_bits =
  133|    122|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|    122|#define FILTER_BITS 7
  ------------------
  134|    122|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|    122|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  135|       |
  136|    122|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  137|    122|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  138|       |
  139|    122|  assert(h >= 1);
  140|    122|  assert(w >= 1);
  141|    122|  assert(IS_POWER_OF_TWO(h));
  142|    122|  assert(IS_POWER_OF_TWO(w));
  143|       |
  144|       |  // excerpt from clip_pixel_highbd()
  145|       |  // set saturation_value to (1 << bd) - 1
  146|    122|  unsigned int saturation_value;
  147|    122|  switch (bd) {
  148|      0|    case 8:
  ------------------
  |  Branch (148:5): [True: 0, False: 122]
  ------------------
  149|      0|    default: saturation_value = 255; break;
  ------------------
  |  Branch (149:5): [True: 0, False: 122]
  ------------------
  150|     56|    case 10: saturation_value = 1023; break;
  ------------------
  |  Branch (150:5): [True: 56, False: 66]
  ------------------
  151|     66|    case 12: saturation_value = 4095; break;
  ------------------
  |  Branch (151:5): [True: 66, False: 56]
  ------------------
  152|    122|  }
  153|       |
  154|    122|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (154:7): [True: 0, False: 122]
  |  Branch (154:20): [True: 0, False: 0]
  ------------------
  155|      0|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (155:21): [True: 0, False: 0]
  ------------------
  156|      0|      for (int j = 0; j < w; ++j) {
  ------------------
  |  Branch (156:23): [True: 0, False: 0]
  ------------------
  157|      0|        int32_t res;
  158|      0|        const int m = mask[j];
  159|      0|        res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
  ------------------
  |  |   24|      0|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  160|      0|               AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  161|      0|        res -= round_offset;
  162|      0|        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  163|      0|        dst[j] = AOMMIN(v, saturation_value);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  164|      0|      }
  165|      0|      mask += mask_stride;
  166|      0|      src0 += src0_stride;
  167|      0|      src1 += src1_stride;
  168|      0|      dst += dst_stride;
  169|      0|    }
  170|    122|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (170:14): [True: 122, False: 0]
  |  Branch (170:27): [True: 0, False: 122]
  ------------------
  171|      0|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (171:21): [True: 0, False: 0]
  ------------------
  172|      0|      for (int j = 0; j < w; ++j) {
  ------------------
  |  Branch (172:23): [True: 0, False: 0]
  ------------------
  173|      0|        int32_t res;
  174|      0|        const int m = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  175|      0|            mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] +
  176|      0|                mask[mask_stride + 2 * j + 1],
  177|      0|            2);
  178|      0|        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
  ------------------
  |  |   24|      0|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  179|      0|              AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  180|      0|        res -= round_offset;
  181|      0|        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  182|      0|        dst[j] = AOMMIN(v, saturation_value);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  183|      0|      }
  184|      0|      mask += 2 * mask_stride;
  185|      0|      src0 += src0_stride;
  186|      0|      src1 += src1_stride;
  187|      0|      dst += dst_stride;
  188|      0|    }
  189|    122|  } else if (subw == 1 && subh == 0) {
  ------------------
  |  Branch (189:14): [True: 122, False: 0]
  |  Branch (189:27): [True: 122, False: 0]
  ------------------
  190|  2.10k|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (190:21): [True: 1.98k, False: 122]
  ------------------
  191|  56.8k|      for (int j = 0; j < w; ++j) {
  ------------------
  |  Branch (191:23): [True: 54.9k, False: 1.98k]
  ------------------
  192|  54.9k|        int32_t res;
  193|  54.9k|        const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]);
  ------------------
  |  |   40|  54.9k|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|  54.9k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  194|  54.9k|        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
  ------------------
  |  |   24|  54.9k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  54.9k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  195|  54.9k|              AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|  54.9k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  196|  54.9k|        res -= round_offset;
  197|  54.9k|        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
  ------------------
  |  |   41|  54.9k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  198|  54.9k|        dst[j] = AOMMIN(v, saturation_value);
  ------------------
  |  |   34|  54.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 54.9k, False: 0]
  |  |  ------------------
  ------------------
  199|  54.9k|      }
  200|  1.98k|      mask += mask_stride;
  201|  1.98k|      src0 += src0_stride;
  202|  1.98k|      src1 += src1_stride;
  203|  1.98k|      dst += dst_stride;
  204|  1.98k|    }
  205|    122|  } else {
  206|      0|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (206:21): [True: 0, False: 0]
  ------------------
  207|      0|      for (int j = 0; j < w; ++j) {
  ------------------
  |  Branch (207:23): [True: 0, False: 0]
  ------------------
  208|      0|        int32_t res;
  209|      0|        const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]);
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  210|      0|        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
  ------------------
  |  |   24|      0|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  211|      0|              AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|      0|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  212|      0|        res -= round_offset;
  213|      0|        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  214|      0|        dst[j] = AOMMIN(v, saturation_value);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  215|      0|      }
  216|      0|      mask += 2 * mask_stride;
  217|      0|      src0 += src0_stride;
  218|      0|      src1 += src1_stride;
  219|      0|      dst += dst_stride;
  220|      0|    }
  221|      0|  }
  222|    122|}
aom_blend_a64_mask_c:
  233|   311k|                          int h, int subw, int subh) {
  234|   311k|  int i, j;
  235|       |
  236|   311k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  237|   311k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  238|       |
  239|   311k|  assert(h >= 1);
  240|   311k|  assert(w >= 1);
  241|   311k|  assert(IS_POWER_OF_TWO(h));
  242|   311k|  assert(IS_POWER_OF_TWO(w));
  243|       |
  244|   311k|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (244:7): [True: 311k, False: 18.4E]
  |  Branch (244:20): [True: 311k, False: 18.4E]
  ------------------
  245|  1.99M|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (245:17): [True: 1.68M, False: 311k]
  ------------------
  246|  5.05M|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (246:19): [True: 3.37M, False: 1.68M]
  ------------------
  247|  3.37M|        const int m = mask[i * mask_stride + j];
  248|  3.37M|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|  3.37M|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|  3.37M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|  3.37M|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  249|  3.37M|                                                src1[i * src1_stride + j]);
  250|  3.37M|      }
  251|  1.68M|    }
  252|  18.4E|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (252:14): [True: 0, False: 18.4E]
  |  Branch (252:27): [True: 0, False: 0]
  ------------------
  253|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (253:17): [True: 0, False: 0]
  ------------------
  254|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (254:19): [True: 0, False: 0]
  ------------------
  255|      0|        const int m = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  256|      0|            mask[(2 * i) * mask_stride + (2 * j)] +
  257|      0|                mask[(2 * i + 1) * mask_stride + (2 * j)] +
  258|      0|                mask[(2 * i) * mask_stride + (2 * j + 1)] +
  259|      0|                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
  260|      0|            2);
  261|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  262|      0|                                                src1[i * src1_stride + j]);
  263|      0|      }
  264|      0|    }
  265|  18.4E|  } else if (subw == 1 && subh == 0) {
  ------------------
  |  Branch (265:14): [True: 0, False: 18.4E]
  |  Branch (265:27): [True: 0, False: 0]
  ------------------
  266|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (266:17): [True: 0, False: 0]
  ------------------
  267|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (267:19): [True: 0, False: 0]
  ------------------
  268|      0|        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  269|      0|                                    mask[i * mask_stride + (2 * j + 1)]);
  270|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  271|      0|                                                src1[i * src1_stride + j]);
  272|      0|      }
  273|      0|    }
  274|  18.4E|  } else {
  275|  18.4E|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (275:17): [True: 0, False: 18.4E]
  ------------------
  276|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (276:19): [True: 0, False: 0]
  ------------------
  277|      0|        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  278|      0|                                    mask[(2 * i + 1) * mask_stride + j]);
  279|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  280|      0|                                                src1[i * src1_stride + j]);
  281|      0|      }
  282|      0|    }
  283|  18.4E|  }
  284|   311k|}
aom_highbd_blend_a64_mask_c:
  291|   213k|                                 int w, int h, int subw, int subh, int bd) {
  292|   213k|  int i, j;
  293|   213k|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|   213k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  294|   213k|  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  ------------------
  |  |   75|   213k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  295|   213k|  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  ------------------
  |  |   75|   213k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  296|   213k|  (void)bd;
  297|       |
  298|   213k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  299|   213k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  300|       |
  301|   213k|  assert(h >= 1);
  302|   213k|  assert(w >= 1);
  303|   213k|  assert(IS_POWER_OF_TWO(h));
  304|   213k|  assert(IS_POWER_OF_TWO(w));
  305|       |
  306|   213k|  assert(bd == 8 || bd == 10 || bd == 12);
  307|       |
  308|   213k|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (308:7): [True: 213k, False: 0]
  |  Branch (308:20): [True: 213k, False: 0]
  ------------------
  309|  1.31M|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (309:17): [True: 1.10M, False: 213k]
  ------------------
  310|  3.31M|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (310:19): [True: 2.21M, False: 1.10M]
  ------------------
  311|  2.21M|        const int m = mask[i * mask_stride + j];
  312|  2.21M|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|  2.21M|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|  2.21M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|  2.21M|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  313|  2.21M|                                                src1[i * src1_stride + j]);
  314|  2.21M|      }
  315|  1.10M|    }
  316|   213k|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (316:14): [True: 0, False: 0]
  |  Branch (316:27): [True: 0, False: 0]
  ------------------
  317|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (317:17): [True: 0, False: 0]
  ------------------
  318|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (318:19): [True: 0, False: 0]
  ------------------
  319|      0|        const int m = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  320|      0|            mask[(2 * i) * mask_stride + (2 * j)] +
  321|      0|                mask[(2 * i + 1) * mask_stride + (2 * j)] +
  322|      0|                mask[(2 * i) * mask_stride + (2 * j + 1)] +
  323|      0|                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
  324|      0|            2);
  325|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  326|      0|                                                src1[i * src1_stride + j]);
  327|      0|      }
  328|      0|    }
  329|      0|  } else if (subw == 1 && subh == 0) {
  ------------------
  |  Branch (329:14): [True: 0, False: 0]
  |  Branch (329:27): [True: 0, False: 0]
  ------------------
  330|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (330:17): [True: 0, False: 0]
  ------------------
  331|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (331:19): [True: 0, False: 0]
  ------------------
  332|      0|        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  333|      0|                                    mask[i * mask_stride + (2 * j + 1)]);
  334|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  335|      0|                                                src1[i * src1_stride + j]);
  336|      0|      }
  337|      0|    }
  338|      0|  } else {
  339|      0|    for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (339:17): [True: 0, False: 0]
  ------------------
  340|      0|      for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (340:19): [True: 0, False: 0]
  ------------------
  341|      0|        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
  ------------------
  |  |   40|      0|#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  342|      0|                                    mask[(2 * i + 1) * mask_stride + j]);
  343|      0|        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|      0|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|      0|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
  344|      0|                                                src1[i * src1_stride + j]);
  345|      0|      }
  346|      0|    }
  347|      0|  }
  348|   213k|}

aom_highbd_blend_a64_vmask_c:
   48|  25.7k|                                  const uint8_t *mask, int w, int h, int bd) {
   49|  25.7k|  int i, j;
   50|  25.7k|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|  25.7k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   51|  25.7k|  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  ------------------
  |  |   75|  25.7k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   52|  25.7k|  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  ------------------
  |  |   75|  25.7k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   53|  25.7k|  (void)bd;
   54|       |
   55|  25.7k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
   56|  25.7k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
   57|       |
   58|  25.7k|  assert(h >= 1);
   59|  25.7k|  assert(w >= 1);
   60|  25.7k|  assert(IS_POWER_OF_TWO(h));
   61|  25.7k|  assert(IS_POWER_OF_TWO(w));
   62|       |
   63|  25.7k|  assert(bd == 8 || bd == 10 || bd == 12);
   64|       |
   65|  77.2k|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (65:15): [True: 51.4k, False: 25.7k]
  ------------------
   66|  51.4k|    const int m = mask[i];
   67|   818k|    for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (67:17): [True: 766k, False: 51.4k]
  ------------------
   68|   766k|      dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
  ------------------
  |  |   27|   766k|  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
  |  |  ------------------
  |  |  |  |   41|   766k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |   28|   766k|                     AOM_BLEND_A64_ROUND_BITS)
  ------------------
   69|   766k|                                              src1[i * src1_stride + j]);
   70|   766k|    }
   71|  51.4k|  }
   72|  25.7k|}

od_ec_dec_init:
  144|   162k|                    uint32_t storage) {
  145|   162k|  dec->buf = buf;
  146|   162k|  dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
  ------------------
  |  |   28|   162k|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  147|   162k|  dec->end = buf + storage;
  148|   162k|  dec->bptr = buf;
  149|   162k|  dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
  ------------------
  |  |   28|   162k|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  150|   162k|  dec->rng = 0x8000;
  151|   162k|  dec->cnt = -15;
  152|   162k|  od_ec_dec_refill(dec);
  153|   162k|}
od_ec_decode_bool_q15:
  158|   122M|int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
  159|   122M|  od_ec_window dif;
  160|   122M|  od_ec_window vw;
  161|   122M|  unsigned r;
  162|   122M|  unsigned r_new;
  163|   122M|  unsigned v;
  164|   122M|  int ret;
  165|   122M|  assert(0 < f);
  166|   122M|  assert(f < 32768U);
  167|   122M|  dif = dec->dif;
  168|   122M|  r = dec->rng;
  169|   122M|  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  170|   122M|  assert(32768U <= r);
  171|   122M|  v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
  ------------------
  |  |   20|   122M|#define EC_PROB_SHIFT 6
  ------------------
                v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
  ------------------
  |  |   20|   122M|#define EC_PROB_SHIFT 6
  ------------------
  172|   122M|  v += EC_MIN_PROB;
  ------------------
  |  |   21|   122M|#define EC_MIN_PROB 4  // must be <= (1<<EC_PROB_SHIFT)/16
  ------------------
  173|   122M|  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
  ------------------
  |  |   28|   122M|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  174|   122M|  ret = 1;
  175|   122M|  r_new = v;
  176|   122M|  if (dif >= vw) {
  ------------------
  |  Branch (176:7): [True: 62.7M, False: 60.1M]
  ------------------
  177|  62.7M|    r_new = r - v;
  178|  62.7M|    dif -= vw;
  179|  62.7M|    ret = 0;
  180|  62.7M|  }
  181|   122M|  return od_ec_dec_normalize(dec, dif, r_new, ret);
  182|   122M|}
od_ec_decode_cdf_q15:
  193|   520M|int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
  194|   520M|  od_ec_window dif;
  195|   520M|  unsigned r;
  196|   520M|  unsigned c;
  197|   520M|  unsigned u;
  198|   520M|  unsigned v;
  199|   520M|  int ret;
  200|   520M|  (void)nsyms;
  201|   520M|  dif = dec->dif;
  202|   520M|  r = dec->rng;
  203|   520M|  const int N = nsyms - 1;
  204|       |
  205|   520M|  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  206|   520M|  assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
  207|   521M|  assert(32768U <= r);
  208|   520M|  assert(7 - EC_PROB_SHIFT >= 0);
  209|   520M|  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
  ------------------
  |  |   28|   520M|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  210|   520M|  v = r;
  211|   520M|  ret = -1;
  212|  1.00G|  do {
  213|  1.00G|    u = v;
  214|  1.00G|    v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
  ------------------
  |  |   20|  1.00G|#define EC_PROB_SHIFT 6
  ------------------
  215|  1.00G|         (7 - EC_PROB_SHIFT));
  ------------------
  |  |   20|  1.00G|#define EC_PROB_SHIFT 6
  ------------------
  216|  1.00G|    v += EC_MIN_PROB * (N - ret);
  ------------------
  |  |   21|  1.00G|#define EC_MIN_PROB 4  // must be <= (1<<EC_PROB_SHIFT)/16
  ------------------
  217|  1.00G|  } while (c < v);
  ------------------
  |  Branch (217:12): [True: 480M, False: 520M]
  ------------------
  218|   520M|  assert(v < u);
  219|   521M|  assert(u <= r);
  220|   521M|  r = u - v;
  221|   521M|  dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
  ------------------
  |  |   28|   521M|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
  222|   521M|  return od_ec_dec_normalize(dec, dif, r, ret);
  223|   521M|}
od_ec_dec_tell:
  231|  1.36M|int od_ec_dec_tell(const od_ec_dec *dec) {
  232|       |  /*There is a window of bits stored in dec->dif. The difference
  233|       |     (dec->bptr - dec->buf) tells us how many bytes have been read into this
  234|       |     window. The difference (dec->cnt - dec->tell_offs) tells us how many of
  235|       |     the bits in that window remain unconsumed.*/
  236|  1.36M|  return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs);
  237|  1.36M|}
entdec.c:od_ec_dec_refill:
   78|  23.2M|static void od_ec_dec_refill(od_ec_dec *dec) {
   79|  23.2M|  int s;
   80|  23.2M|  od_ec_window dif;
   81|  23.2M|  int16_t cnt;
   82|  23.2M|  const unsigned char *bptr;
   83|  23.2M|  const unsigned char *end;
   84|  23.2M|  dif = dec->dif;
   85|  23.2M|  cnt = dec->cnt;
   86|  23.2M|  bptr = dec->bptr;
   87|  23.2M|  end = dec->end;
   88|  23.2M|  s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
  ------------------
  |  |   28|  23.2M|#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
  ------------------
   89|  69.8M|  for (; s >= 0 && bptr < end; s -= 8, bptr++) {
  ------------------
  |  Branch (89:10): [True: 46.6M, False: 23.1M]
  |  Branch (89:20): [True: 46.6M, False: 81.2k]
  ------------------
   90|       |    /*Each time a byte is inserted into the window (dif), bptr advances and cnt
   91|       |       is incremented by 8, so the total number of consumed bits (the return
   92|       |       value of od_ec_dec_tell) does not change.*/
   93|  46.6M|    assert(s <= OD_EC_WINDOW_SIZE - 8);
   94|  46.6M|    dif ^= (od_ec_window)bptr[0] << s;
   95|  46.6M|    cnt += 8;
   96|  46.6M|  }
   97|  23.2M|  if (bptr >= end) {
  ------------------
  |  Branch (97:7): [True: 151k, False: 23.0M]
  ------------------
   98|       |    /*We've reached the end of the buffer. It is perfectly valid for us to need
   99|       |       to fill the window with additional bits past the end of the buffer (and
  100|       |       this happens in normal operation). These bits should all just be taken
  101|       |       as zero. But we cannot increment bptr past 'end' (this is undefined
  102|       |       behavior), so we start to increment dec->tell_offs. We also don't want
  103|       |       to keep testing bptr against 'end', so we set cnt to OD_EC_LOTS_OF_BITS
  104|       |       and adjust dec->tell_offs so that the total number of unconsumed bits in
  105|       |       the window (dec->cnt - dec->tell_offs) does not change. This effectively
  106|       |       puts lots of zero bits into the window, and means we won't try to refill
  107|       |       it from the buffer for a very long time (at which point we'll put lots
  108|       |       of zero bits into the window again).*/
  109|   151k|    dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
  ------------------
  |  |   74|   151k|#define OD_EC_LOTS_OF_BITS (0x4000)
  ------------------
  110|   151k|    cnt = OD_EC_LOTS_OF_BITS;
  ------------------
  |  |   74|   151k|#define OD_EC_LOTS_OF_BITS (0x4000)
  ------------------
  111|   151k|  }
  112|  23.2M|  dec->dif = dif;
  113|  23.2M|  dec->cnt = cnt;
  114|  23.2M|  dec->bptr = bptr;
  115|  23.2M|}
entdec.c:od_ec_dec_normalize:
  126|   637M|                               int ret) {
  127|   637M|  int d;
  128|   637M|  assert(rng <= 65535U);
  129|       |  /*The number of leading zeros in the 16-bit binary representation of rng.*/
  130|   638M|  d = 16 - OD_ILOG_NZ(rng);
  ------------------
  |  |   50|   638M|#define OD_ILOG_NZ(x) (1 + get_msb(x))
  ------------------
  131|       |  /*d bits in dec->dif are consumed.*/
  132|   638M|  dec->cnt -= d;
  133|       |  /*This is equivalent to shifting in 1's instead of 0's.*/
  134|   638M|  dec->dif = ((dif + 1) << d) - 1;
  135|   638M|  dec->rng = rng << d;
  136|   638M|  if (dec->cnt < 0) od_ec_dec_refill(dec);
  ------------------
  |  Branch (136:7): [True: 23.0M, False: 615M]
  ------------------
  137|   638M|  return ret;
  138|   637M|}

aom_highbd_dc_predictor_4x16_c:
  632|   255k|                                    int bd) {
  633|   255k|  highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
  634|   255k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|   255k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  635|   255k|}
aom_highbd_dc_predictor_16x4_c:
  639|   753k|                                    int bd) {
  640|   753k|  highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
  641|   753k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|   753k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  642|   753k|}
aom_highbd_dc_predictor_8x32_c:
  662|   131k|                                    int bd) {
  663|   131k|  highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
  664|   131k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|   131k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  665|   131k|}
aom_highbd_dc_predictor_32x8_c:
  669|   506k|                                    int bd) {
  670|   506k|  highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
  671|   506k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|   506k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  672|   506k|}
aom_highbd_dc_predictor_16x64_c:
  692|  21.1k|                                     const uint16_t *left, int bd) {
  693|  21.1k|  highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
  694|  21.1k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|  21.1k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  695|  21.1k|}
aom_highbd_dc_predictor_64x16_c:
  699|  58.9k|                                     const uint16_t *left, int bd) {
  700|  58.9k|  highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
  701|  58.9k|                           HIGHBD_DC_MULTIPLIER_1X4);
  ------------------
  |  |  584|  58.9k|#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
  ------------------
  702|  58.9k|}
aom_highbd_dc_predictor_32x64_c:
  707|  10.1k|                                     const uint16_t *left, int bd) {
  708|  10.1k|  highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
  709|  10.1k|                           HIGHBD_DC_MULTIPLIER_1X2);
  ------------------
  |  |  581|  10.1k|#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
  ------------------
  710|  10.1k|}
aom_highbd_dc_predictor_64x32_c:
  714|  14.6k|                                     const uint16_t *left, int bd) {
  715|  14.6k|  highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
  716|  14.6k|                           HIGHBD_DC_MULTIPLIER_1X2);
  ------------------
  |  |  581|  14.6k|#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
  ------------------
  717|  14.6k|}
aom_highbd_v_predictor_64x64_c:
  737|  3.89k|      const uint16_t *left, int bd) {                                       \
  738|  3.89k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.89k|  }
aom_highbd_v_predictor_32x64_c:
  737|  1.71k|      const uint16_t *left, int bd) {                                       \
  738|  1.71k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.71k|  }
aom_highbd_v_predictor_64x32_c:
  737|    596|      const uint16_t *left, int bd) {                                       \
  738|    596|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    596|  }
aom_highbd_v_predictor_4x16_c:
  737|  14.9k|      const uint16_t *left, int bd) {                                       \
  738|  14.9k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  14.9k|  }
aom_highbd_v_predictor_16x4_c:
  737|  38.8k|      const uint16_t *left, int bd) {                                       \
  738|  38.8k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  38.8k|  }
aom_highbd_v_predictor_8x32_c:
  737|  10.5k|      const uint16_t *left, int bd) {                                       \
  738|  10.5k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  10.5k|  }
aom_highbd_v_predictor_32x8_c:
  737|  43.7k|      const uint16_t *left, int bd) {                                       \
  738|  43.7k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  43.7k|  }
aom_highbd_v_predictor_16x64_c:
  737|  2.75k|      const uint16_t *left, int bd) {                                       \
  738|  2.75k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.75k|  }
aom_highbd_v_predictor_64x16_c:
  737|  1.27k|      const uint16_t *left, int bd) {                                       \
  738|  1.27k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.27k|  }
aom_highbd_h_predictor_64x64_c:
  737|  19.6k|      const uint16_t *left, int bd) {                                       \
  738|  19.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  19.6k|  }
aom_highbd_h_predictor_32x64_c:
  737|  3.41k|      const uint16_t *left, int bd) {                                       \
  738|  3.41k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.41k|  }
aom_highbd_h_predictor_64x32_c:
  737|  12.9k|      const uint16_t *left, int bd) {                                       \
  738|  12.9k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  12.9k|  }
aom_highbd_h_predictor_4x16_c:
  737|  25.9k|      const uint16_t *left, int bd) {                                       \
  738|  25.9k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  25.9k|  }
aom_highbd_h_predictor_16x4_c:
  737|   184k|      const uint16_t *left, int bd) {                                       \
  738|   184k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   184k|  }
aom_highbd_h_predictor_8x32_c:
  737|  7.50k|      const uint16_t *left, int bd) {                                       \
  738|  7.50k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  7.50k|  }
aom_highbd_h_predictor_32x8_c:
  737|   230k|      const uint16_t *left, int bd) {                                       \
  738|   230k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   230k|  }
aom_highbd_h_predictor_16x64_c:
  737|  3.35k|      const uint16_t *left, int bd) {                                       \
  738|  3.35k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.35k|  }
aom_highbd_h_predictor_64x16_c:
  737|  65.5k|      const uint16_t *left, int bd) {                                       \
  738|  65.5k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  65.5k|  }
aom_highbd_smooth_predictor_4x4_c:
  737|   197k|      const uint16_t *left, int bd) {                                       \
  738|   197k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   197k|  }
aom_highbd_smooth_predictor_8x8_c:
  737|   388k|      const uint16_t *left, int bd) {                                       \
  738|   388k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   388k|  }
aom_highbd_smooth_predictor_16x16_c:
  737|   165k|      const uint16_t *left, int bd) {                                       \
  738|   165k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   165k|  }
aom_highbd_smooth_predictor_32x32_c:
  737|   113k|      const uint16_t *left, int bd) {                                       \
  738|   113k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   113k|  }
aom_highbd_smooth_predictor_64x64_c:
  737|  40.8k|      const uint16_t *left, int bd) {                                       \
  738|  40.8k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  40.8k|  }
aom_highbd_smooth_predictor_4x8_c:
  737|  65.7k|      const uint16_t *left, int bd) {                                       \
  738|  65.7k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  65.7k|  }
aom_highbd_smooth_predictor_8x4_c:
  737|   141k|      const uint16_t *left, int bd) {                                       \
  738|   141k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   141k|  }
aom_highbd_smooth_predictor_8x16_c:
  737|  77.8k|      const uint16_t *left, int bd) {                                       \
  738|  77.8k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  77.8k|  }
aom_highbd_smooth_predictor_16x8_c:
  737|   180k|      const uint16_t *left, int bd) {                                       \
  738|   180k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   180k|  }
aom_highbd_smooth_predictor_16x32_c:
  737|  38.1k|      const uint16_t *left, int bd) {                                       \
  738|  38.1k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  38.1k|  }
aom_highbd_smooth_predictor_32x16_c:
  737|  41.1k|      const uint16_t *left, int bd) {                                       \
  738|  41.1k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  41.1k|  }
aom_highbd_smooth_predictor_32x64_c:
  737|  3.23k|      const uint16_t *left, int bd) {                                       \
  738|  3.23k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.23k|  }
aom_highbd_smooth_predictor_64x32_c:
  737|  9.70k|      const uint16_t *left, int bd) {                                       \
  738|  9.70k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  9.70k|  }
aom_highbd_smooth_predictor_4x16_c:
  737|  75.5k|      const uint16_t *left, int bd) {                                       \
  738|  75.5k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  75.5k|  }
aom_highbd_smooth_predictor_16x4_c:
  737|   108k|      const uint16_t *left, int bd) {                                       \
  738|   108k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   108k|  }
aom_highbd_smooth_predictor_8x32_c:
  737|  31.6k|      const uint16_t *left, int bd) {                                       \
  738|  31.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  31.6k|  }
aom_highbd_smooth_predictor_32x8_c:
  737|  61.6k|      const uint16_t *left, int bd) {                                       \
  738|  61.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  61.6k|  }
aom_highbd_smooth_predictor_16x64_c:
  737|  6.30k|      const uint16_t *left, int bd) {                                       \
  738|  6.30k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  6.30k|  }
aom_highbd_smooth_predictor_64x16_c:
  737|  25.6k|      const uint16_t *left, int bd) {                                       \
  738|  25.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  25.6k|  }
aom_highbd_smooth_v_predictor_4x4_c:
  737|  40.4k|      const uint16_t *left, int bd) {                                       \
  738|  40.4k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  40.4k|  }
aom_highbd_smooth_v_predictor_8x8_c:
  737|   103k|      const uint16_t *left, int bd) {                                       \
  738|   103k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   103k|  }
aom_highbd_smooth_v_predictor_16x16_c:
  737|  44.2k|      const uint16_t *left, int bd) {                                       \
  738|  44.2k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  44.2k|  }
aom_highbd_smooth_v_predictor_32x32_c:
  737|  44.7k|      const uint16_t *left, int bd) {                                       \
  738|  44.7k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  44.7k|  }
aom_highbd_smooth_v_predictor_64x64_c:
  737|  6.99k|      const uint16_t *left, int bd) {                                       \
  738|  6.99k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  6.99k|  }
aom_highbd_smooth_v_predictor_4x8_c:
  737|  17.0k|      const uint16_t *left, int bd) {                                       \
  738|  17.0k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  17.0k|  }
aom_highbd_smooth_v_predictor_8x4_c:
  737|  59.8k|      const uint16_t *left, int bd) {                                       \
  738|  59.8k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  59.8k|  }
aom_highbd_smooth_v_predictor_8x16_c:
  737|  30.7k|      const uint16_t *left, int bd) {                                       \
  738|  30.7k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  30.7k|  }
aom_highbd_smooth_v_predictor_16x8_c:
  737|  50.8k|      const uint16_t *left, int bd) {                                       \
  738|  50.8k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  50.8k|  }
aom_highbd_smooth_v_predictor_16x32_c:
  737|  10.7k|      const uint16_t *left, int bd) {                                       \
  738|  10.7k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  10.7k|  }
aom_highbd_smooth_v_predictor_32x16_c:
  737|  11.8k|      const uint16_t *left, int bd) {                                       \
  738|  11.8k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  11.8k|  }
aom_highbd_smooth_v_predictor_32x64_c:
  737|  1.36k|      const uint16_t *left, int bd) {                                       \
  738|  1.36k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.36k|  }
aom_highbd_smooth_v_predictor_64x32_c:
  737|  5.20k|      const uint16_t *left, int bd) {                                       \
  738|  5.20k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  5.20k|  }
aom_highbd_smooth_v_predictor_4x16_c:
  737|  24.7k|      const uint16_t *left, int bd) {                                       \
  738|  24.7k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  24.7k|  }
aom_highbd_smooth_v_predictor_16x4_c:
  737|  25.6k|      const uint16_t *left, int bd) {                                       \
  738|  25.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  25.6k|  }
aom_highbd_smooth_v_predictor_8x32_c:
  737|  10.4k|      const uint16_t *left, int bd) {                                       \
  738|  10.4k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  10.4k|  }
aom_highbd_smooth_v_predictor_32x8_c:
  737|  32.1k|      const uint16_t *left, int bd) {                                       \
  738|  32.1k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  32.1k|  }
aom_highbd_smooth_v_predictor_16x64_c:
  737|  2.55k|      const uint16_t *left, int bd) {                                       \
  738|  2.55k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.55k|  }
aom_highbd_smooth_v_predictor_64x16_c:
  737|  14.0k|      const uint16_t *left, int bd) {                                       \
  738|  14.0k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  14.0k|  }
aom_highbd_smooth_h_predictor_4x4_c:
  737|   141k|      const uint16_t *left, int bd) {                                       \
  738|   141k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   141k|  }
aom_highbd_smooth_h_predictor_8x8_c:
  737|   105k|      const uint16_t *left, int bd) {                                       \
  738|   105k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   105k|  }
aom_highbd_smooth_h_predictor_16x16_c:
  737|  56.3k|      const uint16_t *left, int bd) {                                       \
  738|  56.3k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  56.3k|  }
aom_highbd_smooth_h_predictor_32x32_c:
  737|  43.5k|      const uint16_t *left, int bd) {                                       \
  738|  43.5k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  43.5k|  }
aom_highbd_smooth_h_predictor_64x64_c:
  737|  6.11k|      const uint16_t *left, int bd) {                                       \
  738|  6.11k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  6.11k|  }
aom_highbd_smooth_h_predictor_4x8_c:
  737|  18.3k|      const uint16_t *left, int bd) {                                       \
  738|  18.3k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  18.3k|  }
aom_highbd_smooth_h_predictor_8x4_c:
  737|  38.1k|      const uint16_t *left, int bd) {                                       \
  738|  38.1k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  38.1k|  }
aom_highbd_smooth_h_predictor_8x16_c:
  737|  29.6k|      const uint16_t *left, int bd) {                                       \
  738|  29.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  29.6k|  }
aom_highbd_smooth_h_predictor_16x8_c:
  737|  54.7k|      const uint16_t *left, int bd) {                                       \
  738|  54.7k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  54.7k|  }
aom_highbd_smooth_h_predictor_16x32_c:
  737|  14.3k|      const uint16_t *left, int bd) {                                       \
  738|  14.3k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  14.3k|  }
aom_highbd_smooth_h_predictor_32x16_c:
  737|  13.2k|      const uint16_t *left, int bd) {                                       \
  738|  13.2k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  13.2k|  }
aom_highbd_smooth_h_predictor_32x64_c:
  737|  1.22k|      const uint16_t *left, int bd) {                                       \
  738|  1.22k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.22k|  }
aom_highbd_smooth_h_predictor_64x32_c:
  737|  1.18k|      const uint16_t *left, int bd) {                                       \
  738|  1.18k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.18k|  }
aom_highbd_smooth_h_predictor_4x16_c:
  737|  19.4k|      const uint16_t *left, int bd) {                                       \
  738|  19.4k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  19.4k|  }
aom_highbd_smooth_h_predictor_16x4_c:
  737|  36.3k|      const uint16_t *left, int bd) {                                       \
  738|  36.3k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  36.3k|  }
aom_highbd_smooth_h_predictor_8x32_c:
  737|  8.47k|      const uint16_t *left, int bd) {                                       \
  738|  8.47k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  8.47k|  }
aom_highbd_smooth_h_predictor_32x8_c:
  737|  41.3k|      const uint16_t *left, int bd) {                                       \
  738|  41.3k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  41.3k|  }
aom_highbd_smooth_h_predictor_16x64_c:
  737|  3.13k|      const uint16_t *left, int bd) {                                       \
  738|  3.13k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.13k|  }
aom_highbd_smooth_h_predictor_64x16_c:
  737|  6.10k|      const uint16_t *left, int bd) {                                       \
  738|  6.10k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  6.10k|  }
aom_highbd_paeth_predictor_4x4_c:
  737|   202k|      const uint16_t *left, int bd) {                                       \
  738|   202k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   202k|  }
aom_highbd_paeth_predictor_8x8_c:
  737|   227k|      const uint16_t *left, int bd) {                                       \
  738|   227k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   227k|  }
aom_highbd_paeth_predictor_16x16_c:
  737|   126k|      const uint16_t *left, int bd) {                                       \
  738|   126k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   126k|  }
aom_highbd_paeth_predictor_32x32_c:
  737|   277k|      const uint16_t *left, int bd) {                                       \
  738|   277k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   277k|  }
aom_highbd_paeth_predictor_64x64_c:
  737|  53.1k|      const uint16_t *left, int bd) {                                       \
  738|  53.1k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  53.1k|  }
aom_highbd_paeth_predictor_4x8_c:
  737|  72.4k|      const uint16_t *left, int bd) {                                       \
  738|  72.4k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  72.4k|  }
aom_highbd_paeth_predictor_8x4_c:
  737|   125k|      const uint16_t *left, int bd) {                                       \
  738|   125k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   125k|  }
aom_highbd_paeth_predictor_8x16_c:
  737|  74.6k|      const uint16_t *left, int bd) {                                       \
  738|  74.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  74.6k|  }
aom_highbd_paeth_predictor_16x8_c:
  737|  89.6k|      const uint16_t *left, int bd) {                                       \
  738|  89.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  89.6k|  }
aom_highbd_paeth_predictor_16x32_c:
  737|  1.13M|      const uint16_t *left, int bd) {                                       \
  738|  1.13M|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.13M|  }
aom_highbd_paeth_predictor_32x16_c:
  737|  31.0k|      const uint16_t *left, int bd) {                                       \
  738|  31.0k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  31.0k|  }
aom_highbd_paeth_predictor_32x64_c:
  737|  5.01k|      const uint16_t *left, int bd) {                                       \
  738|  5.01k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  5.01k|  }
aom_highbd_paeth_predictor_64x32_c:
  737|  3.98k|      const uint16_t *left, int bd) {                                       \
  738|  3.98k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.98k|  }
aom_highbd_paeth_predictor_4x16_c:
  737|  98.1k|      const uint16_t *left, int bd) {                                       \
  738|  98.1k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  98.1k|  }
aom_highbd_paeth_predictor_16x4_c:
  737|   118k|      const uint16_t *left, int bd) {                                       \
  738|   118k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   118k|  }
aom_highbd_paeth_predictor_8x32_c:
  737|  27.0k|      const uint16_t *left, int bd) {                                       \
  738|  27.0k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  27.0k|  }
aom_highbd_paeth_predictor_32x8_c:
  737|  80.5k|      const uint16_t *left, int bd) {                                       \
  738|  80.5k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  80.5k|  }
aom_highbd_paeth_predictor_16x64_c:
  737|   284k|      const uint16_t *left, int bd) {                                       \
  738|   284k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   284k|  }
aom_highbd_paeth_predictor_64x16_c:
  737|  8.25k|      const uint16_t *left, int bd) {                                       \
  738|  8.25k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  8.25k|  }
aom_highbd_dc_128_predictor_64x64_c:
  737|  17.9k|      const uint16_t *left, int bd) {                                       \
  738|  17.9k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  17.9k|  }
aom_highbd_dc_128_predictor_32x64_c:
  737|    841|      const uint16_t *left, int bd) {                                       \
  738|    841|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    841|  }
aom_highbd_dc_128_predictor_64x32_c:
  737|  5.40k|      const uint16_t *left, int bd) {                                       \
  738|  5.40k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  5.40k|  }
aom_highbd_dc_128_predictor_4x16_c:
  737|    164|      const uint16_t *left, int bd) {                                       \
  738|    164|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    164|  }
aom_highbd_dc_128_predictor_16x4_c:
  737|    353|      const uint16_t *left, int bd) {                                       \
  738|    353|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    353|  }
aom_highbd_dc_128_predictor_8x32_c:
  737|    120|      const uint16_t *left, int bd) {                                       \
  738|    120|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    120|  }
aom_highbd_dc_128_predictor_32x8_c:
  737|    265|      const uint16_t *left, int bd) {                                       \
  738|    265|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    265|  }
aom_highbd_dc_128_predictor_16x64_c:
  737|  1.39k|      const uint16_t *left, int bd) {                                       \
  738|  1.39k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.39k|  }
aom_highbd_dc_128_predictor_64x16_c:
  737|  2.94k|      const uint16_t *left, int bd) {                                       \
  738|  2.94k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.94k|  }
aom_highbd_dc_left_predictor_64x64_c:
  737|  40.9k|      const uint16_t *left, int bd) {                                       \
  738|  40.9k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  40.9k|  }
aom_highbd_dc_left_predictor_32x64_c:
  737|  1.99k|      const uint16_t *left, int bd) {                                       \
  738|  1.99k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  1.99k|  }
aom_highbd_dc_left_predictor_64x32_c:
  737|    896|      const uint16_t *left, int bd) {                                       \
  738|    896|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    896|  }
aom_highbd_dc_left_predictor_4x16_c:
  737|  3.20k|      const uint16_t *left, int bd) {                                       \
  738|  3.20k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.20k|  }
aom_highbd_dc_left_predictor_16x4_c:
  737|  6.97k|      const uint16_t *left, int bd) {                                       \
  738|  6.97k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  6.97k|  }
aom_highbd_dc_left_predictor_8x32_c:
  737|  4.82k|      const uint16_t *left, int bd) {                                       \
  738|  4.82k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  4.82k|  }
aom_highbd_dc_left_predictor_32x8_c:
  737|  13.6k|      const uint16_t *left, int bd) {                                       \
  738|  13.6k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  13.6k|  }
aom_highbd_dc_left_predictor_16x64_c:
  737|  4.76k|      const uint16_t *left, int bd) {                                       \
  738|  4.76k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  4.76k|  }
aom_highbd_dc_left_predictor_64x16_c:
  737|    393|      const uint16_t *left, int bd) {                                       \
  738|    393|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    393|  }
aom_highbd_dc_top_predictor_64x64_c:
  737|  31.8k|      const uint16_t *left, int bd) {                                       \
  738|  31.8k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  31.8k|  }
aom_highbd_dc_top_predictor_32x64_c:
  737|  2.03k|      const uint16_t *left, int bd) {                                       \
  738|  2.03k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  2.03k|  }
aom_highbd_dc_top_predictor_64x32_c:
  737|    958|      const uint16_t *left, int bd) {                                       \
  738|    958|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    958|  }
aom_highbd_dc_top_predictor_4x16_c:
  737|  13.0k|      const uint16_t *left, int bd) {                                       \
  738|  13.0k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  13.0k|  }
aom_highbd_dc_top_predictor_16x4_c:
  737|  3.52k|      const uint16_t *left, int bd) {                                       \
  738|  3.52k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  3.52k|  }
aom_highbd_dc_top_predictor_8x32_c:
  737|  10.4k|      const uint16_t *left, int bd) {                                       \
  738|  10.4k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  10.4k|  }
aom_highbd_dc_top_predictor_32x8_c:
  737|  13.4k|      const uint16_t *left, int bd) {                                       \
  738|  13.4k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  13.4k|  }
aom_highbd_dc_top_predictor_16x64_c:
  737|    469|      const uint16_t *left, int bd) {                                       \
  738|    469|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|    469|  }
aom_highbd_dc_top_predictor_64x16_c:
  737|  4.80k|      const uint16_t *left, int bd) {                                       \
  738|  4.80k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|  4.80k|  }
aom_highbd_dc_predictor_64x64_c:
  737|   169k|      const uint16_t *left, int bd) {                                       \
  738|   169k|    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  739|   169k|  }
intrapred.c:divide_using_multiply_shift:
  237|  1.75M|                                              int multiplier, int shift2) {
  238|  1.75M|  const int interm = num >> shift1;
  239|  1.75M|  return interm * multiplier >> shift2;
  240|  1.75M|}
intrapred.c:highbd_dc_predictor_rect:
  592|  1.75M|                                            int shift1, uint32_t multiplier) {
  593|  1.75M|  int sum = 0;
  594|  1.75M|  (void)bd;
  595|       |
  596|  37.4M|  for (int i = 0; i < bw; i++) {
  ------------------
  |  Branch (596:19): [True: 35.7M, False: 1.75M]
  ------------------
  597|  35.7M|    sum += above[i];
  598|  35.7M|  }
  599|  20.5M|  for (int i = 0; i < bh; i++) {
  ------------------
  |  Branch (599:19): [True: 18.7M, False: 1.75M]
  ------------------
  600|  18.7M|    sum += left[i];
  601|  18.7M|  }
  602|       |
  603|  1.75M|  const int expected_dc = divide_using_multiply_shift(
  604|  1.75M|      sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
  ------------------
  |  |  586|  1.75M|#define HIGHBD_DC_SHIFT2 17
  ------------------
  605|  1.75M|  assert(expected_dc < (1 << bd));
  606|       |
  607|  20.5M|  for (int r = 0; r < bh; r++) {
  ------------------
  |  Branch (607:19): [True: 18.7M, False: 1.75M]
  ------------------
  608|  18.7M|    aom_memset16(dst, expected_dc, bw);
  609|  18.7M|    dst += stride;
  610|  18.7M|  }
  611|  1.75M|}
intrapred.c:highbd_v_predictor:
  373|   118k|                                      const uint16_t *left, int bd) {
  374|   118k|  int r;
  375|   118k|  (void)left;
  376|   118k|  (void)bd;
  377|  1.77M|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (377:15): [True: 1.65M, False: 118k]
  ------------------
  378|  1.65M|    memcpy(dst, above, bw * sizeof(uint16_t));
  379|  1.65M|    dst += stride;
  380|  1.65M|  }
  381|   118k|}
intrapred.c:highbd_h_predictor:
  385|   552k|                                      const uint16_t *left, int bd) {
  386|   552k|  int r;
  387|   552k|  (void)above;
  388|   552k|  (void)bd;
  389|  6.91M|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (389:15): [True: 6.36M, False: 552k]
  ------------------
  390|  6.36M|    aom_memset16(dst, left[r], bw);
  391|  6.36M|    dst += stride;
  392|  6.36M|  }
  393|   552k|}
intrapred.c:highbd_smooth_predictor:
  412|  1.77M|                                           const uint16_t *left, int bd) {
  413|  1.77M|  (void)bd;
  414|  1.77M|  const uint16_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
  415|  1.77M|  const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
  416|  1.77M|  const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
  417|  1.77M|  const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
  418|       |  // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
  419|  1.77M|  const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
  ------------------
  |  |   19|  1.77M|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  420|  1.77M|  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  1.77M|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  421|  7.08M|  sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
  ------------------
  |  |   76|  1.77M|  assert(weights_w[0] < weights_scale);                               \
  |  |   77|  1.77M|  assert(weights_h[0] < weights_scale);                               \
  |  |   78|  1.77M|  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
  |  |   79|  1.77M|  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
  |  |   80|  1.77M|  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
  ------------------
  422|  1.77M|                           log2_scale + sizeof(*dst));
  423|  1.77M|  int r;
  424|  24.2M|  for (r = 0; r < bh; ++r) {
  ------------------
  |  Branch (424:15): [True: 22.9M, False: 1.33M]
  ------------------
  425|  22.9M|    int c;
  426|   549M|    for (c = 0; c < bw; ++c) {
  ------------------
  |  Branch (426:17): [True: 527M, False: 22.4M]
  ------------------
  427|   527M|      const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
  428|   527M|      const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
  429|   527M|                                  sm_weights_w[c], scale - sm_weights_w[c] };
  430|   527M|      uint32_t this_pred = 0;
  431|   527M|      int i;
  432|   527M|      assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
  433|  2.63G|      for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (433:19): [True: 2.10G, False: 526M]
  ------------------
  434|  2.10G|        this_pred += weights[i] * pixels[i];
  435|  2.10G|      }
  436|   526M|      dst[c] = divide_round(this_pred, log2_scale);
  ------------------
  |  |   82|   526M|#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))
  ------------------
  437|   526M|    }
  438|  22.4M|    dst += stride;
  439|  22.4M|  }
  440|  1.77M|}
intrapred.c:highbd_smooth_v_predictor:
  445|   536k|                                             const uint16_t *left, int bd) {
  446|   536k|  (void)bd;
  447|   536k|  const uint16_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
  448|   536k|  const uint8_t *const sm_weights = smooth_weights + bh - 4;
  449|       |  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
  450|   536k|  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
  ------------------
  |  |   19|   536k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  451|   536k|  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|   536k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  452|  2.14M|  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
  ------------------
  |  |   76|   536k|  assert(weights_w[0] < weights_scale);                               \
  |  |   77|   536k|  assert(weights_h[0] < weights_scale);                               \
  |  |   78|   536k|  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
  |  |   79|   536k|  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
  |  |   80|   536k|  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
  ------------------
  453|   536k|                           log2_scale + sizeof(*dst));
  454|       |
  455|   536k|  int r;
  456|  7.63M|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (456:15): [True: 7.11M, False: 520k]
  ------------------
  457|  7.11M|    int c;
  458|   168M|    for (c = 0; c < bw; ++c) {
  ------------------
  |  Branch (458:17): [True: 161M, False: 7.09M]
  ------------------
  459|   161M|      const uint16_t pixels[] = { above[c], below_pred };
  460|   161M|      const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
  461|   161M|      uint32_t this_pred = 0;
  462|   161M|      assert(scale >= sm_weights[r]);
  463|   161M|      int i;
  464|   485M|      for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (464:19): [True: 323M, False: 161M]
  ------------------
  465|   323M|        this_pred += weights[i] * pixels[i];
  466|   323M|      }
  467|   161M|      dst[c] = divide_round(this_pred, log2_scale);
  ------------------
  |  |   82|   161M|#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))
  ------------------
  468|   161M|    }
  469|  7.09M|    dst += stride;
  470|  7.09M|  }
  471|   536k|}
intrapred.c:highbd_smooth_h_predictor:
  476|   638k|                                             const uint16_t *left, int bd) {
  477|   638k|  (void)bd;
  478|   638k|  const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
  479|   638k|  const uint8_t *const sm_weights = smooth_weights + bw - 4;
  480|       |  // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
  481|   638k|  const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
  ------------------
  |  |   19|   638k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  482|   638k|  const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|   638k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  483|  2.55M|  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
  ------------------
  |  |   76|   638k|  assert(weights_w[0] < weights_scale);                               \
  |  |   77|   638k|  assert(weights_h[0] < weights_scale);                               \
  |  |   78|   638k|  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
  |  |   79|   638k|  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
  |  |   80|   638k|  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
  ------------------
  484|   638k|                           log2_scale + sizeof(*dst));
  485|       |
  486|   638k|  int r;
  487|  8.08M|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (487:15): [True: 7.44M, False: 635k]
  ------------------
  488|  7.44M|    int c;
  489|   157M|    for (c = 0; c < bw; ++c) {
  ------------------
  |  Branch (489:17): [True: 150M, False: 7.44M]
  ------------------
  490|   150M|      const uint16_t pixels[] = { left[r], right_pred };
  491|   150M|      const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
  492|   150M|      uint32_t this_pred = 0;
  493|   150M|      assert(scale >= sm_weights[c]);
  494|   150M|      int i;
  495|   451M|      for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (495:19): [True: 300M, False: 150M]
  ------------------
  496|   300M|        this_pred += weights[i] * pixels[i];
  497|   300M|      }
  498|   150M|      dst[c] = divide_round(this_pred, log2_scale);
  ------------------
  |  |   82|   150M|#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))
  ------------------
  499|   150M|    }
  500|  7.44M|    dst += stride;
  501|  7.44M|  }
  502|   638k|}
intrapred.c:paeth_predictor_single:
   48|  1.53G|                                              uint16_t top_left) {
   49|  1.53G|  const int base = top + left - top_left;
   50|  1.53G|  const int p_left = abs_diff(base, left);
   51|  1.53G|  const int p_top = abs_diff(base, top);
   52|  1.53G|  const int p_top_left = abs_diff(base, top_left);
   53|       |
   54|       |  // Return nearest to base of left, top and top_left.
   55|  1.53G|  return (p_left <= p_top && p_left <= p_top_left) ? left
  ------------------
  |  Branch (55:11): [True: 1.44G, False: 96.2M]
  |  Branch (55:30): [True: 1.42G, False: 10.5M]
  ------------------
   56|  1.53G|         : (p_top <= p_top_left)                   ? top
  ------------------
  |  Branch (56:12): [True: 91.0M, False: 15.7M]
  ------------------
   57|   106M|                                                   : top_left;
   58|  1.53G|}
intrapred.c:abs_diff:
   45|  4.60G|static inline int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
  ------------------
  |  Branch (45:51): [True: 284M, False: 4.32G]
  ------------------
intrapred.c:highbd_paeth_predictor:
  397|  3.04M|                                          const uint16_t *left, int bd) {
  398|  3.04M|  int r, c;
  399|  3.04M|  const uint16_t ytop_left = above[-1];
  400|  3.04M|  (void)bd;
  401|       |
  402|  82.1M|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (402:15): [True: 79.1M, False: 3.04M]
  ------------------
  403|  1.61G|    for (c = 0; c < bw; c++)
  ------------------
  |  Branch (403:17): [True: 1.53G, False: 79.1M]
  ------------------
  404|  1.53G|      dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
  405|  79.1M|    dst += stride;
  406|  79.1M|  }
  407|  3.04M|}
intrapred.c:highbd_dc_128_predictor:
  507|  29.4k|                                           const uint16_t *left, int bd) {
  508|  29.4k|  int r;
  509|  29.4k|  (void)above;
  510|  29.4k|  (void)left;
  511|       |
  512|  1.55M|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (512:15): [True: 1.52M, False: 29.4k]
  ------------------
  513|  1.52M|    aom_memset16(dst, 128 << (bd - 8), bw);
  514|  1.52M|    dst += stride;
  515|  1.52M|  }
  516|  29.4k|}
intrapred.c:highbd_dc_left_predictor:
  521|  77.6k|                                            const uint16_t *left, int bd) {
  522|  77.6k|  int i, r, expected_dc, sum = 0;
  523|  77.6k|  (void)above;
  524|  77.6k|  (void)bd;
  525|       |
  526|  3.50M|  for (i = 0; i < bh; i++) sum += left[i];
  ------------------
  |  Branch (526:15): [True: 3.43M, False: 77.6k]
  ------------------
  527|  77.6k|  expected_dc = (sum + (bh >> 1)) / bh;
  528|       |
  529|  3.50M|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (529:15): [True: 3.43M, False: 77.6k]
  ------------------
  530|  3.43M|    aom_memset16(dst, expected_dc, bw);
  531|  3.43M|    dst += stride;
  532|  3.43M|  }
  533|  77.6k|}
intrapred.c:highbd_dc_top_predictor:
  538|  80.6k|                                           const uint16_t *left, int bd) {
  539|  80.6k|  int i, r, expected_dc, sum = 0;
  540|  80.6k|  (void)left;
  541|  80.6k|  (void)bd;
  542|       |
  543|  3.18M|  for (i = 0; i < bw; i++) sum += above[i];
  ------------------
  |  Branch (543:15): [True: 3.10M, False: 80.6k]
  ------------------
  544|  80.6k|  expected_dc = (sum + (bw >> 1)) / bw;
  545|       |
  546|  3.05M|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (546:15): [True: 2.97M, False: 80.6k]
  ------------------
  547|  2.97M|    aom_memset16(dst, expected_dc, bw);
  548|  2.97M|    dst += stride;
  549|  2.97M|  }
  550|  80.6k|}
intrapred.c:highbd_dc_predictor:
  554|   169k|                                       const uint16_t *left, int bd) {
  555|   169k|  int i, r, expected_dc, sum = 0;
  556|   169k|  const int count = bw + bh;
  557|   169k|  (void)bd;
  558|       |
  559|  11.0M|  for (i = 0; i < bw; i++) {
  ------------------
  |  Branch (559:15): [True: 10.8M, False: 169k]
  ------------------
  560|  10.8M|    sum += above[i];
  561|  10.8M|  }
  562|  11.0M|  for (i = 0; i < bh; i++) {
  ------------------
  |  Branch (562:15): [True: 10.8M, False: 169k]
  ------------------
  563|  10.8M|    sum += left[i];
  564|  10.8M|  }
  565|       |
  566|   169k|  expected_dc = (sum + (count >> 1)) / count;
  567|       |
  568|  10.9M|  for (r = 0; r < bh; r++) {
  ------------------
  |  Branch (568:15): [True: 10.8M, False: 169k]
  ------------------
  569|  10.8M|    aom_memset16(dst, expected_dc, bw);
  570|  10.8M|    dst += stride;
  571|  10.8M|  }
  572|   169k|}

decodeframe.c:update_cdf:
  110|  12.9M|static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
  111|  12.9M|  assert(nsymbs < 17);
  112|  12.9M|  const int count = cdf[nsymbs];
  113|       |
  114|       |  // rate is computed in the spec as:
  115|       |  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
  116|       |  // In this case cdf[N] is |count|.
  117|       |  // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
  118|       |  // nsymbs > 3. So the equation becomes:
  119|       |  //  4 + (count > 15) + (count > 31) + (nsymbs > 3).
  120|       |  // Note that the largest value for count is 32 (it is not incremented beyond
  121|       |  // 32). So using that information:
  122|       |  //  count >> 4 is 0 for count from 0 to 15.
  123|       |  //  count >> 4 is 1 for count from 16 to 31.
  124|       |  //  count >> 4 is 2 for count == 32.
  125|       |  // Now, the equation becomes:
  126|       |  //  4 + (count >> 4) + (nsymbs > 3).
  127|  12.9M|  const int rate = 4 + (count >> 4) + (nsymbs > 3);
  128|       |
  129|  12.9M|  int i = 0;
  130|  79.8M|  do {
  131|  79.8M|    if (i < val) {
  ------------------
  |  Branch (131:9): [True: 24.7M, False: 55.1M]
  ------------------
  132|  24.7M|      cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
  ------------------
  |  |   33|  24.7M|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  24.7M|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  133|  55.1M|    } else {
  134|  55.1M|      cdf[i] -= cdf[i] >> rate;
  135|  55.1M|    }
  136|  79.8M|  } while (++i < nsymbs - 1);
  ------------------
  |  Branch (136:12): [True: 66.8M, False: 12.9M]
  ------------------
  137|  12.9M|  cdf[nsymbs] += (count < 32);
  138|  12.9M|}
decodemv.c:update_cdf:
  110|   106M|static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
  111|   106M|  assert(nsymbs < 17);
  112|   106M|  const int count = cdf[nsymbs];
  113|       |
  114|       |  // rate is computed in the spec as:
  115|       |  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
  116|       |  // In this case cdf[N] is |count|.
  117|       |  // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
  118|       |  // nsymbs > 3. So the equation becomes:
  119|       |  //  4 + (count > 15) + (count > 31) + (nsymbs > 3).
  120|       |  // Note that the largest value for count is 32 (it is not incremented beyond
  121|       |  // 32). So using that information:
  122|       |  //  count >> 4 is 0 for count from 0 to 15.
  123|       |  //  count >> 4 is 1 for count from 16 to 31.
  124|       |  //  count >> 4 is 2 for count == 32.
  125|       |  // Now, the equation becomes:
  126|       |  //  4 + (count >> 4) + (nsymbs > 3).
  127|   106M|  const int rate = 4 + (count >> 4) + (nsymbs > 3);
  128|       |
  129|   106M|  int i = 0;
  130|   518M|  do {
  131|   518M|    if (i < val) {
  ------------------
  |  Branch (131:9): [True: 194M, False: 324M]
  ------------------
  132|   194M|      cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
  ------------------
  |  |   33|   194M|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|   194M|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  133|   324M|    } else {
  134|   324M|      cdf[i] -= cdf[i] >> rate;
  135|   324M|    }
  136|   518M|  } while (++i < nsymbs - 1);
  ------------------
  |  Branch (136:12): [True: 411M, False: 106M]
  ------------------
  137|   106M|  cdf[nsymbs] += (count < 32);
  138|   106M|}
decodetxb.c:update_cdf:
  110|   360M|static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
  111|   360M|  assert(nsymbs < 17);
  112|   360M|  const int count = cdf[nsymbs];
  113|       |
  114|       |  // rate is computed in the spec as:
  115|       |  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
  116|       |  // In this case cdf[N] is |count|.
  117|       |  // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
  118|       |  // nsymbs > 3. So the equation becomes:
  119|       |  //  4 + (count > 15) + (count > 31) + (nsymbs > 3).
  120|       |  // Note that the largest value for count is 32 (it is not incremented beyond
  121|       |  // 32). So using that information:
  122|       |  //  count >> 4 is 0 for count from 0 to 15.
  123|       |  //  count >> 4 is 1 for count from 16 to 31.
  124|       |  //  count >> 4 is 2 for count == 32.
  125|       |  // Now, the equation becomes:
  126|       |  //  4 + (count >> 4) + (nsymbs > 3).
  127|   360M|  const int rate = 4 + (count >> 4) + (nsymbs > 3);
  128|       |
  129|   360M|  int i = 0;
  130|  1.01G|  do {
  131|  1.01G|    if (i < val) {
  ------------------
  |  Branch (131:9): [True: 227M, False: 783M]
  ------------------
  132|   227M|      cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
  ------------------
  |  |   33|   227M|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|   227M|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  133|   783M|    } else {
  134|   783M|      cdf[i] -= cdf[i] >> rate;
  135|   783M|    }
  136|  1.01G|  } while (++i < nsymbs - 1);
  ------------------
  |  Branch (136:12): [True: 649M, False: 360M]
  ------------------
  137|   360M|  cdf[nsymbs] += (count < 32);
  138|   360M|}
detokenize.c:update_cdf:
  110|  24.4M|static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
  111|  24.4M|  assert(nsymbs < 17);
  112|  24.4M|  const int count = cdf[nsymbs];
  113|       |
  114|       |  // rate is computed in the spec as:
  115|       |  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
  116|       |  // In this case cdf[N] is |count|.
  117|       |  // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
  118|       |  // nsymbs > 3. So the equation becomes:
  119|       |  //  4 + (count > 15) + (count > 31) + (nsymbs > 3).
  120|       |  // Note that the largest value for count is 32 (it is not incremented beyond
  121|       |  // 32). So using that information:
  122|       |  //  count >> 4 is 0 for count from 0 to 15.
  123|       |  //  count >> 4 is 1 for count from 16 to 31.
  124|       |  //  count >> 4 is 2 for count == 32.
  125|       |  // Now, the equation becomes:
  126|       |  //  4 + (count >> 4) + (nsymbs > 3).
  127|  24.4M|  const int rate = 4 + (count >> 4) + (nsymbs > 3);
  128|       |
  129|  24.4M|  int i = 0;
  130|  83.1M|  do {
  131|  83.1M|    if (i < val) {
  ------------------
  |  Branch (131:9): [True: 14.9M, False: 68.2M]
  ------------------
  132|  14.9M|      cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
  ------------------
  |  |   33|  14.9M|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  14.9M|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  133|  68.2M|    } else {
  134|  68.2M|      cdf[i] -= cdf[i] >> rate;
  135|  68.2M|    }
  136|  83.1M|  } while (++i < nsymbs - 1);
  ------------------
  |  Branch (136:12): [True: 58.7M, False: 24.4M]
  ------------------
  137|  24.4M|  cdf[nsymbs] += (count < 32);
  138|  24.4M|}

bitreader_buffer.c:inv_recenter_finite_nonneg:
   32|   115k|                                                  uint16_t v) {
   33|   115k|  if ((r << 1) <= n) {
  ------------------
  |  Branch (33:7): [True: 96.6k, False: 18.5k]
  ------------------
   34|  96.6k|    return inv_recenter_nonneg(r, v);
   35|  96.6k|  } else {
   36|  18.5k|    return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
   37|  18.5k|  }
   38|   115k|}
bitreader_buffer.c:inv_recenter_nonneg:
   20|   115k|static inline uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
   21|   115k|  if (v > (r << 1))
  ------------------
  |  Branch (21:7): [True: 15.6k, False: 99.4k]
  ------------------
   22|  15.6k|    return v;
   23|  99.4k|  else if ((v & 1) == 0)
  ------------------
  |  Branch (23:12): [True: 62.6k, False: 36.8k]
  ------------------
   24|  62.6k|    return (v >> 1) + r;
   25|  36.8k|  else
   26|  36.8k|    return r - ((v + 1) >> 1);
   27|   115k|}
binary_codes_reader.c:inv_recenter_finite_nonneg:
   32|   384k|                                                  uint16_t v) {
   33|   384k|  if ((r << 1) <= n) {
  ------------------
  |  Branch (33:7): [True: 233k, False: 150k]
  ------------------
   34|   233k|    return inv_recenter_nonneg(r, v);
   35|   233k|  } else {
   36|   150k|    return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
   37|   150k|  }
   38|   384k|}
binary_codes_reader.c:inv_recenter_nonneg:
   20|   384k|static inline uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
   21|   384k|  if (v > (r << 1))
  ------------------
  |  Branch (21:7): [True: 59.9k, False: 324k]
  ------------------
   22|  59.9k|    return v;
   23|   324k|  else if ((v & 1) == 0)
  ------------------
  |  Branch (23:12): [True: 171k, False: 153k]
  ------------------
   24|   171k|    return (v >> 1) + r;
   25|   153k|  else
   26|   153k|    return r - ((v + 1) >> 1);
   27|   384k|}

cdef_block_avx2.c:v128_load_unaligned:
   46|   724M|SIMD_INLINE v128 v128_load_unaligned(const void *p) {
   47|   724M|#if defined(__SSSE3__)
   48|   724M|  return _mm_lddqu_si128((__m128i *)p);
   49|       |#else
   50|       |  return _mm_loadu_si128((__m128i *)p);
   51|       |#endif
   52|   724M|}
cdef_block_avx2.c:v128_sub_16:
  120|  65.4k|SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
cdef_block_avx2.c:v128_shr_s16:
  562|  65.4k|SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
  563|  65.4k|  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
  564|  65.4k|}
cdef_block_avx2.c:v128_dup_16:
   86|  65.4k|SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
cdef_block_avx2.c:v128_add_16:
   98|   523k|SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
cdef_block_avx2.c:v128_shuffle_8:
  300|  49.0k|SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
  301|  49.0k|#if defined(__SSSE3__)
  302|  49.0k|  return _mm_shuffle_epi8(x, pattern);
  303|       |#else
  304|       |  v128 output;
  305|       |  unsigned char *input = (unsigned char *)&x;
  306|       |  unsigned char *index = (unsigned char *)&pattern;
  307|       |  unsigned char *selected = (unsigned char *)&output;
  308|       |  int counter;
  309|       |
  310|       |  for (counter = 0; counter < 16; counter++) {
  311|       |    selected[counter] = input[index[counter] & 15];
  312|       |  }
  313|       |
  314|       |  return output;
  315|       |#endif
  316|  49.0k|}
cdef_block_avx2.c:v128_ziplo_16:
  155|  81.8k|SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
  156|  81.8k|  return _mm_unpacklo_epi16(b, a);
  157|  81.8k|}
cdef_block_avx2.c:v128_ziphi_16:
  159|  81.8k|SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
  160|  81.8k|  return _mm_unpackhi_epi16(b, a);
  161|  81.8k|}
cdef_block_avx2.c:v128_add_32:
  106|  98.1k|SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
cdef_block_avx2.c:v128_from_32:
   38|   147k|SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
   39|   147k|  return _mm_set_epi32((int)a, (int)b, (int)c, (int)d);
   40|   147k|}
cdef_block_avx2.c:v128_madd_s16:
  425|   114k|SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
cdef_block_avx2.c:v128_mullo_s32:
  406|   114k|SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
  407|   114k|#if defined(__SSE4_1__)
  408|   114k|  return _mm_mullo_epi32(a, b);
  409|       |#else
  410|       |  return _mm_unpacklo_epi32(
  411|       |      _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
  412|       |      _mm_shuffle_epi32(
  413|       |          _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
  414|       |#endif
  415|   114k|}
cdef_block_avx2.c:v128_dup_32:
   88|  16.3k|SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
cdef_block_avx2.c:v128_ziplo_32:
  163|  65.4k|SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
  164|  65.4k|  return _mm_unpacklo_epi32(b, a);
  165|  65.4k|}
cdef_block_avx2.c:v128_ziphi_32:
  167|  65.4k|SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
  168|  65.4k|  return _mm_unpackhi_epi32(b, a);
  169|  65.4k|}
cdef_block_avx2.c:v128_ziplo_64:
  171|  65.4k|SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
  172|  65.4k|  return _mm_unpacklo_epi64(b, a);
  173|  65.4k|}
cdef_block_avx2.c:v128_ziphi_64:
  175|  65.4k|SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
  176|  65.4k|  return _mm_unpackhi_epi64(b, a);
  177|  65.4k|}
cdef_block_avx2.c:v128_max_s32:
  503|  24.5k|SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
  504|  24.5k|#if defined(__SSE4_1__)
  505|  24.5k|  return _mm_max_epi32(a, b);
  506|       |#else
  507|       |  v128 mask = _mm_cmplt_epi32(b, a);
  508|       |  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
  509|       |#endif
  510|  24.5k|}
cdef_block_avx2.c:v128_low_u32:
   20|  8.18k|SIMD_INLINE uint32_t v128_low_u32(v128 a) {
   21|  8.18k|  return (uint32_t)_mm_cvtsi128_si32(a);
   22|  8.18k|}
cdef_block_avx2.c:v128_pack_s32_s16:
  255|  8.18k|SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
  256|  8.18k|  return _mm_packs_epi32(b, a);
  257|  8.18k|}
cdef_block_avx2.c:v128_cmpeq_32:
  526|  16.3k|SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
cdef_block_avx2.c:v128_movemask_8:
  470|  8.18k|SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
cdef_block_avx2.c:v128_pack_s16_s8:
  272|  8.18k|SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
  273|  8.18k|  return _mm_packs_epi16(b, a);
  274|  8.18k|}
cdef_block_avx2.c:v128_load_aligned:
   42|   150M|SIMD_INLINE v128 v128_load_aligned(const void *p) {
   43|   150M|  return _mm_load_si128((__m128i *)p);
   44|   150M|}
cdef_block_avx2.c:v128_high_v64:
   28|   106M|SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
cdef_block_avx2.c:v128_low_v64:
   24|   109M|SIMD_INLINE v64 v128_low_v64(v128 a) {
   25|   109M|  return _mm_unpacklo_epi64(a, v64_zero());
   26|   109M|}
cdef_block_avx2.c:v128_from_v64:
   30|   401M|SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
   31|   401M|  return _mm_unpacklo_epi64(b, a);
   32|   401M|}
cdef_block_avx2.c:v128_pack_s16_u8:
  268|  11.0M|SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
  269|  11.0M|  return _mm_packus_epi16(b, a);
  270|  11.0M|}
cdef_block_avx2.c:v128_store_unaligned:
   58|   151M|SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
   59|   151M|  _mm_storeu_si128((__m128i *)p, a);
   60|   151M|}

cdef_block_avx2.c:v256_dup_16:
   83|   463M|SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16((short)x); }
cdef_block_avx2.c:v256_zero:
   79|   124M|SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
cdef_block_avx2.c:v256_from_v128:
   50|   386M|SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
   51|       |  // gcc seems to be missing _mm256_set_m128i()
   52|   386M|  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
   53|   386M|}
cdef_block_avx2.c:v256_sub_16:
  121|   321M|SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
cdef_block_avx2.c:v256_abs_s16:
  135|   321M|SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
cdef_block_avx2.c:v256_ssub_u16:
  127|   332M|SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  128|   332M|  return _mm256_subs_epu16(a, b);
  129|   332M|}
cdef_block_avx2.c:v256_shr_u16:
  622|   330M|SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
  623|   330M|  return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c));
  624|   330M|}
cdef_block_avx2.c:v256_xor:
  493|   336M|SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
cdef_block_avx2.c:v256_add_16:
   93|   719M|SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
cdef_block_avx2.c:v256_mullo_s16:
  506|   139M|SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  507|   139M|  return _mm256_mullo_epi16(a, b);
  508|   139M|}
cdef_block_avx2.c:v256_max_u8:
  544|   190M|SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
cdef_block_avx2.c:v256_and:
  495|   242M|SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
cdef_block_avx2.c:v256_min_s16:
  558|   490M|SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
cdef_block_avx2.c:v256_cmplt_s16:
  582|  68.2M|SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  583|  68.2M|  return _mm256_cmpgt_epi16(b, a);
  584|  68.2M|}
cdef_block_avx2.c:v256_max_s16:
  560|   267M|SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
cdef_block_avx2.c:v256_low_v128:
   44|  80.9M|SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
cdef_block_avx2.c:v256_pack_s16_u8:
  303|  42.5M|SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  304|  42.5M|  return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
  305|  42.5M|                                  _MM_SHUFFLE(3, 1, 2, 0));
  306|  42.5M|}
cdef_block_avx2.c:v256_high_v128:
   46|  36.6M|SIMD_INLINE v128 v256_high_v128(v256 a) {
   47|  36.6M|  return _mm256_extracti128_si256(a, 1);
   48|  36.6M|}
cdef_block_avx2.c:v256_from_v64:
   55|   201M|SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
   56|   201M|  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
   57|   201M|}

cdef_block_avx2.c:v64_store_aligned:
   85|   147M|SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
   86|   147M|  _mm_storel_epi64((__m128i *)p, a);
   87|   147M|}
cdef_block_avx2.c:v64_zero:
  102|   109M|SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
cdef_block_avx2.c:v64_load_aligned:
   77|   187M|SIMD_INLINE v64 v64_load_aligned(const void *p) {
   78|   187M|  return _mm_loadl_epi64((__m128i *)p);
   79|   187M|}
cdef_block_avx2.c:v64_load_unaligned:
   81|   689M|SIMD_INLINE v64 v64_load_unaligned(const void *p) {
   82|   689M|  return _mm_loadl_epi64((__m128i *)p);
   83|   689M|}
cdef_block_avx2.c:u32_store_aligned:
   69|  83.6M|SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
   70|  83.6M|  *((uint32_t *)p) = a;
   71|  83.6M|}
cdef_block_avx2.c:v64_high_u32:
   29|  42.7M|SIMD_INLINE uint32_t v64_high_u32(v64 a) {
   30|  42.7M|  return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
   31|  42.7M|}
cdef_block_avx2.c:v64_low_u32:
   25|  44.4M|SIMD_INLINE uint32_t v64_low_u32(v64 a) {
   26|  44.4M|  return (uint32_t)_mm_cvtsi128_si32(a);
   27|  44.4M|}

aom_convolve_copy_avx2:
   29|  1.70M|                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
   30|       |  // The w == 16 case uses _mm_store_si128(), which requires its output address
   31|       |  // be aligned on a 16-byte boundary.
   32|  1.70M|  if (w == 16) {
  ------------------
  |  Branch (32:7): [True: 310k, False: 1.39M]
  ------------------
   33|   310k|    assert(!((intptr_t)dst % 16));
   34|   310k|    assert(!(dst_stride % 16));
   35|   310k|  }
   36|       |
   37|  1.70M|  if (w == 2) {
  ------------------
  |  Branch (37:7): [True: 147k, False: 1.55M]
  ------------------
   38|   355k|    do {
   39|   355k|      memmove(dst, src, 2 * sizeof(*src));
   40|   355k|      src += src_stride;
   41|   355k|      dst += dst_stride;
   42|   355k|      memmove(dst, src, 2 * sizeof(*src));
   43|   355k|      src += src_stride;
   44|   355k|      dst += dst_stride;
   45|   355k|      h -= 2;
   46|   355k|    } while (h);
  ------------------
  |  Branch (46:14): [True: 207k, False: 147k]
  ------------------
   47|  1.55M|  } else if (w == 4) {
  ------------------
  |  Branch (47:14): [True: 557k, False: 999k]
  ------------------
   48|  1.88M|    do {
   49|  1.88M|      memmove(dst, src, 4 * sizeof(*src));
   50|  1.88M|      src += src_stride;
   51|  1.88M|      dst += dst_stride;
   52|  1.88M|      memmove(dst, src, 4 * sizeof(*src));
   53|  1.88M|      src += src_stride;
   54|  1.88M|      dst += dst_stride;
   55|  1.88M|      h -= 2;
   56|  1.88M|    } while (h);
  ------------------
  |  Branch (56:14): [True: 1.32M, False: 557k]
  ------------------
   57|   999k|  } else if (w == 8) {
  ------------------
  |  Branch (57:14): [True: 521k, False: 478k]
  ------------------
   58|  2.09M|    do {
   59|  2.09M|      __m128i s[2];
   60|  2.09M|      s[0] = _mm_loadl_epi64((__m128i *)src);
   61|  2.09M|      src += src_stride;
   62|  2.09M|      s[1] = _mm_loadl_epi64((__m128i *)src);
   63|  2.09M|      src += src_stride;
   64|  2.09M|      _mm_storel_epi64((__m128i *)dst, s[0]);
   65|  2.09M|      dst += dst_stride;
   66|  2.09M|      _mm_storel_epi64((__m128i *)dst, s[1]);
   67|  2.09M|      dst += dst_stride;
   68|  2.09M|      h -= 2;
   69|  2.09M|    } while (h);
  ------------------
  |  Branch (69:14): [True: 1.57M, False: 521k]
  ------------------
   70|   521k|  } else if (w == 16) {
  ------------------
  |  Branch (70:14): [True: 310k, False: 167k]
  ------------------
   71|  1.71M|    do {
   72|  1.71M|      __m128i s[2];
   73|  1.71M|      s[0] = _mm_loadu_si128((__m128i *)src);
   74|  1.71M|      src += src_stride;
   75|  1.71M|      s[1] = _mm_loadu_si128((__m128i *)src);
   76|  1.71M|      src += src_stride;
   77|  1.71M|      _mm_store_si128((__m128i *)dst, s[0]);
   78|  1.71M|      dst += dst_stride;
   79|  1.71M|      _mm_store_si128((__m128i *)dst, s[1]);
   80|  1.71M|      dst += dst_stride;
   81|  1.71M|      h -= 2;
   82|  1.71M|    } while (h);
  ------------------
  |  Branch (82:14): [True: 1.40M, False: 310k]
  ------------------
   83|   310k|  } else if (w == 32) {
  ------------------
  |  Branch (83:14): [True: 79.7k, False: 88.1k]
  ------------------
   84|   887k|    do {
   85|   887k|      __m256i s[2];
   86|   887k|      s[0] = _mm256_loadu_si256((__m256i *)src);
   87|   887k|      src += src_stride;
   88|   887k|      s[1] = _mm256_loadu_si256((__m256i *)src);
   89|   887k|      src += src_stride;
   90|   887k|      _mm256_storeu_si256((__m256i *)dst, s[0]);
   91|   887k|      dst += dst_stride;
   92|   887k|      _mm256_storeu_si256((__m256i *)dst, s[1]);
   93|   887k|      dst += dst_stride;
   94|   887k|      h -= 2;
   95|   887k|    } while (h);
  ------------------
  |  Branch (95:14): [True: 808k, False: 79.7k]
  ------------------
   96|  88.1k|  } else if (w == 64) {
  ------------------
  |  Branch (96:14): [True: 42.3k, False: 45.8k]
  ------------------
   97|  1.95M|    do {
   98|  1.95M|      __m256i s[4];
   99|  1.95M|      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
  100|  1.95M|      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
  101|  1.95M|      src += src_stride;
  102|  1.95M|      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
  103|  1.95M|      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
  104|  1.95M|      src += src_stride;
  105|  1.95M|      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
  106|  1.95M|      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
  107|  1.95M|      dst += dst_stride;
  108|  1.95M|      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
  109|  1.95M|      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
  110|  1.95M|      dst += dst_stride;
  111|  1.95M|      h -= 2;
  112|  1.95M|    } while (h);
  ------------------
  |  Branch (112:14): [True: 1.91M, False: 42.3k]
  ------------------
  113|  45.8k|  } else {
  114|  2.93M|    do {
  115|  2.93M|      copy_128(src, dst);
  116|  2.93M|      src += src_stride;
  117|  2.93M|      dst += dst_stride;
  118|  2.93M|      copy_128(src, dst);
  119|  2.93M|      src += src_stride;
  120|  2.93M|      dst += dst_stride;
  121|  2.93M|      h -= 2;
  122|  2.93M|    } while (h);
  ------------------
  |  Branch (122:14): [True: 2.88M, False: 45.8k]
  ------------------
  123|  45.8k|  }
  124|  1.70M|}
aom_highbd_convolve_copy_avx2:
  163|  1.23M|                                   int h) {
  164|       |  // The w == 8 case uses _mm_store_si128(), which requires its output address
  165|       |  // be aligned on a 16-byte boundary.
  166|  1.23M|  if (w == 8) {
  ------------------
  |  Branch (166:7): [True: 401k, False: 829k]
  ------------------
  167|   401k|    assert(!((intptr_t)dst % 16));
  168|   401k|    assert(!(dst_stride % 8));
  169|   401k|  }
  170|       |
  171|  1.23M|  if (w == 2) {
  ------------------
  |  Branch (171:7): [True: 111k, False: 1.11M]
  ------------------
  172|   243k|    do {
  173|   243k|      memmove(dst, src, 2 * sizeof(*src));
  174|   243k|      src += src_stride;
  175|   243k|      dst += dst_stride;
  176|   243k|      memmove(dst, src, 2 * sizeof(*src));
  177|   243k|      src += src_stride;
  178|   243k|      dst += dst_stride;
  179|   243k|      h -= 2;
  180|   243k|    } while (h);
  ------------------
  |  Branch (180:14): [True: 131k, False: 111k]
  ------------------
  181|  1.11M|  } else if (w == 4) {
  ------------------
  |  Branch (181:14): [True: 390k, False: 729k]
  ------------------
  182|  1.27M|    do {
  183|  1.27M|      __m128i s[2];
  184|  1.27M|      s[0] = _mm_loadl_epi64((__m128i *)src);
  185|  1.27M|      src += src_stride;
  186|  1.27M|      s[1] = _mm_loadl_epi64((__m128i *)src);
  187|  1.27M|      src += src_stride;
  188|  1.27M|      _mm_storel_epi64((__m128i *)dst, s[0]);
  189|  1.27M|      dst += dst_stride;
  190|  1.27M|      _mm_storel_epi64((__m128i *)dst, s[1]);
  191|  1.27M|      dst += dst_stride;
  192|  1.27M|      h -= 2;
  193|  1.27M|    } while (h);
  ------------------
  |  Branch (193:14): [True: 883k, False: 390k]
  ------------------
  194|   729k|  } else if (w == 8) {
  ------------------
  |  Branch (194:14): [True: 401k, False: 327k]
  ------------------
  195|  1.45M|    do {
  196|  1.45M|      __m128i s[2];
  197|  1.45M|      s[0] = _mm_loadu_si128((__m128i *)src);
  198|  1.45M|      src += src_stride;
  199|  1.45M|      s[1] = _mm_loadu_si128((__m128i *)src);
  200|  1.45M|      src += src_stride;
  201|  1.45M|      _mm_store_si128((__m128i *)dst, s[0]);
  202|  1.45M|      dst += dst_stride;
  203|  1.45M|      _mm_store_si128((__m128i *)dst, s[1]);
  204|  1.45M|      dst += dst_stride;
  205|  1.45M|      h -= 2;
  206|  1.45M|    } while (h);
  ------------------
  |  Branch (206:14): [True: 1.04M, False: 401k]
  ------------------
  207|   401k|  } else if (w == 16) {
  ------------------
  |  Branch (207:14): [True: 252k, False: 74.9k]
  ------------------
  208|  1.26M|    do {
  209|  1.26M|      __m256i s[2];
  210|  1.26M|      s[0] = _mm256_loadu_si256((__m256i *)src);
  211|  1.26M|      src += src_stride;
  212|  1.26M|      s[1] = _mm256_loadu_si256((__m256i *)src);
  213|  1.26M|      src += src_stride;
  214|  1.26M|      _mm256_storeu_si256((__m256i *)dst, s[0]);
  215|  1.26M|      dst += dst_stride;
  216|  1.26M|      _mm256_storeu_si256((__m256i *)dst, s[1]);
  217|  1.26M|      dst += dst_stride;
  218|  1.26M|      h -= 2;
  219|  1.26M|    } while (h);
  ------------------
  |  Branch (219:14): [True: 1.01M, False: 252k]
  ------------------
  220|   252k|  } else if (w == 32) {
  ------------------
  |  Branch (220:14): [True: 58.3k, False: 16.6k]
  ------------------
  221|   590k|    do {
  222|   590k|      __m256i s[4];
  223|   590k|      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  224|   590k|      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  225|   590k|      src += src_stride;
  226|   590k|      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  227|   590k|      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  228|   590k|      src += src_stride;
  229|   590k|      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  230|   590k|      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  231|   590k|      dst += dst_stride;
  232|   590k|      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
  233|   590k|      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
  234|   590k|      dst += dst_stride;
  235|   590k|      h -= 2;
  236|   590k|    } while (h);
  ------------------
  |  Branch (236:14): [True: 532k, False: 58.3k]
  ------------------
  237|  58.3k|  } else if (w == 64) {
  ------------------
  |  Branch (237:14): [True: 14.1k, False: 2.56k]
  ------------------
  238|   310k|    do {
  239|   310k|      highbd_copy_64(src, dst);
  240|   310k|      src += src_stride;
  241|   310k|      dst += dst_stride;
  242|   310k|      highbd_copy_64(src, dst);
  243|   310k|      src += src_stride;
  244|   310k|      dst += dst_stride;
  245|   310k|      h -= 2;
  246|   310k|    } while (h);
  ------------------
  |  Branch (246:14): [True: 296k, False: 14.1k]
  ------------------
  247|  14.1k|  } else {
  248|  2.56k|    assert(w == 128);
  249|   149k|    do {
  250|   149k|      highbd_copy_128(src, dst);
  251|   149k|      src += src_stride;
  252|   149k|      dst += dst_stride;
  253|   149k|      highbd_copy_128(src, dst);
  254|   149k|      src += src_stride;
  255|   149k|      dst += dst_stride;
  256|   149k|      h -= 2;
  257|   149k|    } while (h);
  ------------------
  |  Branch (257:14): [True: 146k, False: 2.68k]
  ------------------
  258|  2.68k|  }
  259|  1.23M|}
aom_convolve_copy_avx2.c:copy_128:
   16|  5.86M|static inline void copy_128(const uint8_t *src, uint8_t *dst) {
   17|  5.86M|  __m256i s[4];
   18|  5.86M|  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
   19|  5.86M|  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
   20|  5.86M|  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
   21|  5.86M|  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
   22|  5.86M|  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
   23|  5.86M|  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
   24|  5.86M|  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
   25|  5.86M|  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
   26|  5.86M|}
aom_convolve_copy_avx2.c:highbd_copy_64:
  128|   620k|static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
  129|   620k|  __m256i s[4];
  130|   620k|  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  131|   620k|  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  132|   620k|  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
  133|   620k|  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
  134|   620k|  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  135|   620k|  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  136|   620k|  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
  137|   620k|  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
  138|   620k|}
aom_convolve_copy_avx2.c:highbd_copy_128:
  140|   296k|static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
  141|   296k|  __m256i s[8];
  142|   296k|  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  143|   296k|  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  144|   296k|  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
  145|   296k|  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
  146|   296k|  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
  147|   296k|  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
  148|   296k|  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
  149|   296k|  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
  150|       |
  151|   296k|  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  152|   296k|  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  153|   296k|  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
  154|   296k|  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
  155|   296k|  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
  156|   296k|  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
  157|   296k|  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
  158|   296k|  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
  159|   296k|}

aom_blend_a64_hmask_sse4_1:
   22|   995k|                                const uint8_t *mask, int w, int h) {
   23|   995k|  aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
   24|   995k|                            src1_stride, mask, 0, w, h, 0, 0);
   25|   995k|}
aom_highbd_blend_a64_hmask_sse4_1:
   31|   867k|    const uint8_t *mask, int w, int h, int bd) {
   32|   867k|  aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
   33|   867k|                                   src1_8, src1_stride, mask, 0, w, h, 0, 0,
   34|   867k|                                   bd);
   35|   867k|}

aom_lowbd_blend_a64_d16_mask_avx2:
  288|   125k|    ConvolveParams *conv_params) {
  289|   125k|  const int bd = 8;
  290|   125k|  const int round_bits =
  291|   125k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   125k|#define FILTER_BITS 7
  ------------------
  292|       |
  293|   125k|  const int round_offset =
  294|   125k|      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
  295|   125k|       (1 << (round_bits - 1)))
  296|   125k|      << AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|   125k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  297|       |
  298|   125k|  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|   125k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  299|   125k|  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  300|   125k|  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
  301|       |
  302|   125k|  assert(h >= 4);
  303|   125k|  assert(w >= 4);
  304|   125k|  assert(IS_POWER_OF_TWO(h));
  305|   125k|  assert(IS_POWER_OF_TWO(w));
  306|   125k|  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
  307|   125k|  const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
  308|       |
  309|   125k|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (309:7): [True: 43.5k, False: 82.3k]
  |  Branch (309:20): [True: 43.5k, False: 0]
  ------------------
  310|  43.5k|    switch (w) {
  311|      0|      case 4:
  ------------------
  |  Branch (311:7): [True: 0, False: 43.5k]
  ------------------
  312|      0|        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
  313|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  314|      0|            mask_stride, h, &v_round_offset, shift);
  315|      0|        break;
  316|  16.4k|      case 8:
  ------------------
  |  Branch (316:7): [True: 16.4k, False: 27.1k]
  ------------------
  317|  16.4k|        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
  318|  16.4k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  319|  16.4k|            mask_stride, h, &v_round_offset, shift);
  320|  16.4k|        break;
  321|  16.7k|      case 16:
  ------------------
  |  Branch (321:7): [True: 16.7k, False: 26.8k]
  ------------------
  322|  16.7k|        lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
  323|  16.7k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  324|  16.7k|            mask_stride, h, &y_round_offset, shift);
  325|  16.7k|        break;
  326|  10.4k|      default:
  ------------------
  |  Branch (326:7): [True: 10.4k, False: 33.1k]
  ------------------
  327|  10.4k|        lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
  328|  10.4k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  329|  10.4k|            mask_stride, h, w, &y_round_offset, shift);
  330|  10.4k|        break;
  331|  43.5k|    }
  332|  82.4k|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (332:14): [True: 82.4k, False: 18.4E]
  |  Branch (332:27): [True: 82.1k, False: 218]
  ------------------
  333|  82.1k|    switch (w) {
  334|  31.5k|      case 4:
  ------------------
  |  Branch (334:7): [True: 31.5k, False: 50.5k]
  ------------------
  335|  31.5k|        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
  336|  31.5k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  337|  31.5k|            mask_stride, h, &v_round_offset, shift);
  338|  31.5k|        break;
  339|  30.9k|      case 8:
  ------------------
  |  Branch (339:7): [True: 30.9k, False: 51.2k]
  ------------------
  340|  30.9k|        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
  341|  30.9k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  342|  30.9k|            mask_stride, h, &v_round_offset, shift);
  343|  30.9k|        break;
  344|  12.8k|      case 16:
  ------------------
  |  Branch (344:7): [True: 12.8k, False: 69.3k]
  ------------------
  345|  12.8k|        lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
  346|  12.8k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  347|  12.8k|            mask_stride, h, &y_round_offset, shift);
  348|  12.8k|        break;
  349|  6.80k|      default:
  ------------------
  |  Branch (349:7): [True: 6.80k, False: 75.3k]
  ------------------
  350|  6.80k|        lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
  351|  6.80k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  352|  6.80k|            mask_stride, h, w, &y_round_offset, shift);
  353|  6.80k|        break;
  354|  82.1k|    }
  355|  82.1k|  } else if (subw == 1 && subh == 0) {
  ------------------
  |  Branch (355:14): [True: 218, False: 18.4E]
  |  Branch (355:27): [True: 218, False: 0]
  ------------------
  356|    218|    switch (w) {
  357|     36|      case 4:
  ------------------
  |  Branch (357:7): [True: 36, False: 182]
  ------------------
  358|     36|        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
  359|     36|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  360|     36|            mask_stride, h, &v_round_offset, shift);
  361|     36|        break;
  362|     50|      case 8:
  ------------------
  |  Branch (362:7): [True: 50, False: 168]
  ------------------
  363|     50|        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
  364|     50|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  365|     50|            mask_stride, h, &v_round_offset, shift);
  366|     50|        break;
  367|     66|      case 16:
  ------------------
  |  Branch (367:7): [True: 66, False: 152]
  ------------------
  368|     66|        lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
  369|     66|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  370|     66|            mask_stride, h, w, &y_round_offset, shift);
  371|     66|        break;
  372|     66|      default:
  ------------------
  |  Branch (372:7): [True: 66, False: 152]
  ------------------
  373|     66|        lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
  374|     66|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  375|     66|            mask_stride, h, w, &y_round_offset, shift);
  376|     66|        break;
  377|    218|    }
  378|  18.4E|  } else {
  379|  18.4E|    switch (w) {
  380|      0|      case 4:
  ------------------
  |  Branch (380:7): [True: 0, False: 18.4E]
  ------------------
  381|      0|        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
  382|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  383|      0|            mask_stride, h, &v_round_offset, shift);
  384|      0|        break;
  385|      0|      case 8:
  ------------------
  |  Branch (385:7): [True: 0, False: 18.4E]
  ------------------
  386|      0|        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
  387|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  388|      0|            mask_stride, h, &v_round_offset, shift);
  389|      0|        break;
  390|      0|      case 16:
  ------------------
  |  Branch (390:7): [True: 0, False: 18.4E]
  ------------------
  391|      0|        lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
  392|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  393|      0|            mask_stride, h, w, &y_round_offset, shift);
  394|      0|        break;
  395|      0|      default:
  ------------------
  |  Branch (395:7): [True: 0, False: 18.4E]
  ------------------
  396|      0|        lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
  397|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  398|      0|            mask_stride, h, w, &y_round_offset, shift);
  399|      0|        break;
  400|  18.4E|    }
  401|  18.4E|  }
  402|   125k|}
aom_blend_a64_mask_avx2:
  873|   433k|                             int h, int subw, int subh) {
  874|   433k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  875|   433k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  876|       |
  877|   433k|  assert(h >= 1);
  878|   433k|  assert(w >= 1);
  879|   433k|  assert(IS_POWER_OF_TWO(h));
  880|   433k|  assert(IS_POWER_OF_TWO(w));
  881|       |
  882|   433k|  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
  ------------------
  |  |   55|   433k|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 0, False: 433k]
  |  |  ------------------
  ------------------
  883|      0|    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
  884|      0|                         mask, mask_stride, w, h, subw, subh);
  885|   433k|  } else {
  886|   433k|    if (subw & subh) {
  ------------------
  |  Branch (886:9): [True: 66.4k, False: 366k]
  ------------------
  887|  66.4k|      blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
  888|  66.4k|                                src1_stride, mask, mask_stride, w, h);
  889|   366k|    } else if (subw) {
  ------------------
  |  Branch (889:16): [True: 36, False: 366k]
  ------------------
  890|     36|      blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
  891|     36|                             src1_stride, mask, mask_stride, w, h);
  892|   366k|    } else if (subh) {
  ------------------
  |  Branch (892:16): [True: 0, False: 366k]
  ------------------
  893|      0|      blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
  894|      0|                             src1_stride, mask, mask_stride, w, h);
  895|   366k|    } else {
  896|   366k|      blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
  897|   366k|                          mask, mask_stride, w, h);
  898|   366k|    }
  899|   433k|  }
  900|   433k|}
aom_highbd_blend_a64_d16_mask_avx2:
 1297|   284k|    ConvolveParams *conv_params, const int bd) {
 1298|   284k|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|   284k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1299|   284k|  const int round_bits =
 1300|   284k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   284k|#define FILTER_BITS 7
  ------------------
 1301|   284k|  const int32_t round_offset =
 1302|   284k|      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
 1303|   284k|       (1 << (round_bits - 1)))
 1304|   284k|      << AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|   284k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
 1305|   284k|  const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
 1306|   284k|  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
  ------------------
  |  |   23|   284k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
 1307|       |
 1308|   284k|  const __m256i clip_low = _mm256_setzero_si256();
 1309|   284k|  const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
 1310|   284k|  const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|   284k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|   284k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
 1311|       |
 1312|   284k|  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
 1313|   284k|  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
 1314|       |
 1315|   284k|  assert(h >= 4);
 1316|   284k|  assert(w >= 4);
 1317|   284k|  assert(IS_POWER_OF_TWO(h));
 1318|   284k|  assert(IS_POWER_OF_TWO(w));
 1319|       |
 1320|   284k|  if (subw == 0 && subh == 0) {
  ------------------
  |  Branch (1320:7): [True: 96.2k, False: 188k]
  |  Branch (1320:20): [True: 96.2k, False: 0]
  ------------------
 1321|  96.2k|    switch (w) {
 1322|      0|      case 4:
  ------------------
  |  Branch (1322:7): [True: 0, False: 96.2k]
  ------------------
 1323|      0|        highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
 1324|      0|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1325|      0|            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
 1326|      0|            &mask_max);
 1327|      0|        break;
 1328|  36.6k|      case 8:
  ------------------
  |  Branch (1328:7): [True: 36.6k, False: 59.6k]
  ------------------
 1329|  36.6k|        highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
 1330|  36.6k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1331|  36.6k|            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
 1332|  36.6k|            &mask_max);
 1333|  36.6k|        break;
 1334|  59.6k|      default:  // >= 16
  ------------------
  |  Branch (1334:7): [True: 59.6k, False: 36.6k]
  ------------------
 1335|  59.6k|        highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
 1336|  59.6k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1337|  59.6k|            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
 1338|  59.6k|            &mask_max);
 1339|  59.6k|        break;
 1340|  96.2k|    }
 1341|       |
 1342|   188k|  } else if (subw == 1 && subh == 1) {
  ------------------
  |  Branch (1342:14): [True: 188k, False: 18.4E]
  |  Branch (1342:27): [True: 188k, False: 122]
  ------------------
 1343|   188k|    switch (w) {
 1344|  72.0k|      case 4:
  ------------------
  |  Branch (1344:7): [True: 72.0k, False: 116k]
  ------------------
 1345|  72.0k|        highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
 1346|  72.0k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1347|  72.0k|            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
 1348|  72.0k|            &mask_max);
 1349|  72.0k|        break;
 1350|  63.4k|      case 8:
  ------------------
  |  Branch (1350:7): [True: 63.4k, False: 125k]
  ------------------
 1351|  63.4k|        highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
 1352|  63.4k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1353|  63.4k|            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
 1354|  63.4k|            &mask_max);
 1355|  63.4k|        break;
 1356|  53.0k|      default:  // >= 16
  ------------------
  |  Branch (1356:7): [True: 53.0k, False: 135k]
  ------------------
 1357|  53.0k|        highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
 1358|  53.0k|            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
 1359|  53.0k|            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
 1360|  53.0k|            &mask_max);
 1361|  53.0k|        break;
 1362|   188k|    }
 1363|   188k|  } else {
 1364|       |    // Sub-sampling in only one axis doesn't seem to happen very much, so fall
 1365|       |    // back to the vanilla C implementation instead of having all the optimised
 1366|       |    // code for these.
 1367|    111|    aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
 1368|    111|                                    src1_stride, mask, mask_stride, w, h, subw,
 1369|    111|                                    subh, conv_params, bd);
 1370|    111|  }
 1371|   284k|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2:
   86|  16.7k|    const __m256i *round_offset, int shift) {
   87|  16.7k|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  16.7k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  16.7k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   88|   448k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (88:19): [True: 431k, False: 16.7k]
  ------------------
   89|   431k|    const __m128i m = xx_loadu_128(mask);
   90|   431k|    const __m256i m0 = _mm256_cvtepu8_epi16(m);
   91|       |
   92|   431k|    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
   93|   431k|                                shift);
   94|   431k|    mask += mask_stride;
   95|   431k|    dst += dst_stride;
   96|   431k|    src0 += src0_stride;
   97|   431k|    src1 += src1_stride;
   98|   431k|  }
   99|  16.7k|}
blend_a64_mask_avx2.c:blend_a64_d16_mask_w16_avx2:
   31|   573k|    int shift) {
   32|   573k|  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
   33|   573k|  const __m256i s0_0 = yy_loadu_256(src0);
   34|   573k|  const __m256i s1_0 = yy_loadu_256(src1);
   35|   573k|  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
   36|   573k|                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
   37|   573k|  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
   38|   573k|                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
   39|   573k|  res0_lo =
   40|   573k|      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
   41|   573k|  res0_hi =
   42|   573k|      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
   43|   573k|  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
   44|   573k|  __m256i res = _mm256_packus_epi16(res0, res0);
   45|   573k|  res = _mm256_permute4x64_epi64(res, 0xd8);
   46|   573k|  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
   47|   573k|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2:
  105|  10.4k|    const __m256i *round_offset, int shift) {
  106|  10.4k|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  10.4k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  10.4k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  107|   389k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (107:19): [True: 378k, False: 10.4k]
  ------------------
  108|  1.11M|    for (int j = 0; j < w; j += 32) {
  ------------------
  |  Branch (108:21): [True: 734k, False: 378k]
  ------------------
  109|   734k|      const __m256i m = yy_loadu_256(mask + j);
  110|   734k|      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
  111|   734k|      const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
  112|       |
  113|   734k|      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
  114|   734k|                                  round_offset, &v_maxval, shift);
  115|   734k|    }
  116|   378k|    mask += mask_stride;
  117|   378k|    dst += dst_stride;
  118|   378k|    src0 += src0_stride;
  119|   378k|    src1 += src1_stride;
  120|   378k|  }
  121|  10.4k|}
blend_a64_mask_avx2.c:blend_a64_d16_mask_w32_avx2:
   52|  1.01M|    const __m256i *v_maxval, int shift) {
   53|  1.01M|  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
   54|  1.01M|  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
   55|  1.01M|  const __m256i s0_0 = yy_loadu_256(src0);
   56|  1.01M|  const __m256i s0_1 = yy_loadu_256(src0 + 16);
   57|  1.01M|  const __m256i s1_0 = yy_loadu_256(src1);
   58|  1.01M|  const __m256i s1_1 = yy_loadu_256(src1 + 16);
   59|  1.01M|  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
   60|  1.01M|                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
   61|  1.01M|  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
   62|  1.01M|                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
   63|  1.01M|  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
   64|  1.01M|                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
   65|  1.01M|  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
   66|  1.01M|                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
   67|  1.01M|  res0_lo =
   68|  1.01M|      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
   69|  1.01M|  res0_hi =
   70|  1.01M|      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
   71|  1.01M|  res1_lo =
   72|  1.01M|      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
   73|  1.01M|  res1_hi =
   74|  1.01M|      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
   75|  1.01M|  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
   76|  1.01M|  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
   77|  1.01M|  __m256i res = _mm256_packus_epi16(res0, res1);
   78|  1.01M|  res = _mm256_permute4x64_epi64(res, 0xd8);
   79|  1.01M|  _mm256_storeu_si256((__m256i *)(dst), res);
   80|  1.01M|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2:
  127|  12.8k|    const __m256i *round_offset, int shift) {
  128|  12.8k|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  12.8k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  12.8k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  129|  12.8k|  const __m256i one_b = _mm256_set1_epi8(1);
  130|  12.8k|  const __m256i two_w = _mm256_set1_epi16(2);
  131|   153k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (131:19): [True: 140k, False: 12.8k]
  ------------------
  132|   140k|    const __m256i m_i00 = yy_loadu_256(mask);
  133|   140k|    const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
  134|       |
  135|   140k|    const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
  136|   140k|    const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
  137|   140k|    const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
  138|       |
  139|   140k|    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
  140|   140k|                                shift);
  141|   140k|    mask += mask_stride << 1;
  142|   140k|    dst += dst_stride;
  143|   140k|    src0 += src0_stride;
  144|   140k|    src1 += src1_stride;
  145|   140k|  }
  146|  12.8k|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2:
  152|  6.80k|    const __m256i *round_offset, int shift) {
  153|  6.80k|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  6.80k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  6.80k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  154|  6.80k|  const __m256i one_b = _mm256_set1_epi8(1);
  155|  6.80k|  const __m256i two_w = _mm256_set1_epi16(2);
  156|   223k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (156:19): [True: 216k, False: 6.80k]
  ------------------
  157|   490k|    for (int j = 0; j < w; j += 32) {
  ------------------
  |  Branch (157:21): [True: 273k, False: 216k]
  ------------------
  158|   273k|      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
  159|   273k|      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
  160|   273k|      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
  161|   273k|      const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
  162|       |
  163|   273k|      const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
  164|   273k|      const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
  165|   273k|      const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
  166|   273k|      const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
  167|   273k|      const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
  168|   273k|      const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
  169|       |
  170|   273k|      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
  171|   273k|                                  round_offset, &v_maxval, shift);
  172|   273k|    }
  173|   216k|    mask += mask_stride << 1;
  174|   216k|    dst += dst_stride;
  175|   216k|    src0 += src0_stride;
  176|   216k|    src1 += src1_stride;
  177|   216k|  }
  178|  6.80k|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2:
  184|     66|    const __m256i *round_offset, int shift) {
  185|     66|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     66|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     66|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  186|     66|  const __m256i one_b = _mm256_set1_epi8(1);
  187|     66|  const __m256i zeros = _mm256_setzero_si256();
  188|    994|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (188:19): [True: 928, False: 66]
  ------------------
  189|  1.85k|    for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (189:21): [True: 928, False: 928]
  ------------------
  190|    928|      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
  191|    928|      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
  192|    928|      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
  193|       |
  194|    928|      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
  195|    928|                                  round_offset, &v_maxval, shift);
  196|    928|    }
  197|    928|    mask += mask_stride;
  198|    928|    dst += dst_stride;
  199|    928|    src0 += src0_stride;
  200|    928|    src1 += src1_stride;
  201|    928|  }
  202|     66|}
blend_a64_mask_avx2.c:lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2:
  208|     66|    const __m256i *round_offset, int shift) {
  209|     66|  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     66|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     66|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  210|     66|  const __m256i one_b = _mm256_set1_epi8(1);
  211|     66|  const __m256i zeros = _mm256_setzero_si256();
  212|  4.99k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (212:19): [True: 4.92k, False: 66]
  ------------------
  213|  13.9k|    for (int j = 0; j < w; j += 32) {
  ------------------
  |  Branch (213:21): [True: 9.02k, False: 4.92k]
  ------------------
  214|  9.02k|      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
  215|  9.02k|      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
  216|  9.02k|      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
  217|  9.02k|      const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
  218|  9.02k|      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
  219|  9.02k|      const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
  220|       |
  221|  9.02k|      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
  222|  9.02k|                                  round_offset, &v_maxval, shift);
  223|  9.02k|    }
  224|  4.92k|    mask += mask_stride;
  225|  4.92k|    dst += dst_stride;
  226|  4.92k|    src0 += src0_stride;
  227|  4.92k|    src1 += src1_stride;
  228|  4.92k|  }
  229|     66|}
blend_a64_mask_avx2.c:blend_a64_mask_sx_sy_avx2:
  518|  66.4k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  519|  66.4k|  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  520|  66.4k|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  66.4k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  66.4k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  521|  66.4k|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|  66.4k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  522|  66.4k|  switch (w) {
  523|  36.3k|    case 4:
  ------------------
  |  Branch (523:5): [True: 36.3k, False: 30.1k]
  ------------------
  524|   170k|      do {
  525|   170k|        const __m128i v_ra_b = xx_loadl_64(mask);
  526|   170k|        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
  527|   170k|        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
  528|   170k|        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
  529|   170k|        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
  530|   170k|        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
  531|   170k|        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
  532|   170k|        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
  533|   170k|        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
  534|   170k|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  535|       |
  536|   170k|        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  537|       |
  538|   170k|        xx_storel_32(dst, v_res_b);
  539|       |
  540|   170k|        dst += dst_stride;
  541|   170k|        src0 += src0_stride;
  542|   170k|        src1 += src1_stride;
  543|   170k|        mask += 2 * mask_stride;
  544|   170k|      } while (--h);
  ------------------
  |  Branch (544:16): [True: 134k, False: 36.3k]
  ------------------
  545|  36.3k|      break;
  546|  26.1k|    case 8:
  ------------------
  |  Branch (546:5): [True: 26.1k, False: 40.3k]
  ------------------
  547|   179k|      do {
  548|   179k|        const __m128i v_ra_b = xx_loadu_128(mask);
  549|   179k|        const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
  550|   179k|        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
  551|   179k|        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
  552|   179k|        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
  553|   179k|        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
  554|   179k|        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
  555|   179k|        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
  556|   179k|        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
  557|   179k|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  558|       |
  559|   179k|        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  560|       |
  561|   179k|        xx_storel_64(dst, v_res_b);
  562|       |
  563|   179k|        dst += dst_stride;
  564|   179k|        src0 += src0_stride;
  565|   179k|        src1 += src1_stride;
  566|   179k|        mask += 2 * mask_stride;
  567|   179k|      } while (--h);
  ------------------
  |  Branch (567:16): [True: 153k, False: 26.1k]
  ------------------
  568|  26.1k|      break;
  569|  4.02k|    case 16:
  ------------------
  |  Branch (569:5): [True: 4.02k, False: 62.4k]
  ------------------
  570|  4.02k|      blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
  571|  4.02k|                                    src1_stride, mask, mask_stride, h);
  572|  4.02k|      break;
  573|      0|    default:
  ------------------
  |  Branch (573:5): [True: 0, False: 66.4k]
  ------------------
  574|      0|      blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
  575|      0|                                     src1_stride, mask, mask_stride, w, h);
  576|      0|      break;
  577|  66.4k|  }
  578|  66.4k|}
blend_a64_mask_avx2.c:blend_a64_mask_sx_sy_w16_avx2:
  446|  4.02k|    const uint8_t *mask, uint32_t mask_stride, int h) {
  447|  4.02k|  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
  448|  4.02k|  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  4.02k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  4.02k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  449|  48.7k|  do {
  450|  48.7k|    const __m256i v_ral_b = yy_loadu_256(mask);
  451|  48.7k|    const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
  452|  48.7k|    const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
  453|  48.7k|    const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
  454|  48.7k|    const __m256i v_rvsbl_w =
  455|  48.7k|        _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
  456|  48.7k|    const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
  457|       |
  458|  48.7k|    const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
  459|  48.7k|    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
  460|  48.7k|    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
  461|       |
  462|  48.7k|    const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
  463|  48.7k|                                             AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  48.7k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  464|       |
  465|  48.7k|    xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
  466|  48.7k|    dst += dst_stride;
  467|  48.7k|    src0 += src0_stride;
  468|  48.7k|    src1 += src1_stride;
  469|  48.7k|    mask += 2 * mask_stride;
  470|  48.7k|  } while (--h);
  ------------------
  |  Branch (470:12): [True: 44.7k, False: 4.02k]
  ------------------
  471|  4.02k|}
blend_a64_mask_avx2.c:blend_16_u8_avx2:
  407|  49.0k|                                       const int32_t bits) {
  408|  49.0k|  const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
  409|  49.0k|  const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
  410|  49.0k|  const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
  411|  49.0k|  const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
  412|       |
  413|  49.0k|  const __m256i v_p0_w =
  414|  49.0k|      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
  415|  49.0k|                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
  416|       |
  417|  49.0k|  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
  418|  49.0k|  const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
  419|  49.0k|  const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
  420|  49.0k|  return v_res;
  421|  49.0k|}
blend_a64_mask_avx2.c:blend_32_u8_avx2:
  426|   339k|                                       const int32_t bits) {
  427|   339k|  const __m256i v_s0_b = yy_loadu_256(src0);
  428|   339k|  const __m256i v_s1_b = yy_loadu_256(src1);
  429|       |
  430|   339k|  const __m256i v_p0_w =
  431|   339k|      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
  432|   339k|                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
  433|   339k|  const __m256i v_p1_w =
  434|   339k|      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
  435|   339k|                           _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
  436|       |
  437|   339k|  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
  438|   339k|  const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
  439|   339k|  const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
  440|   339k|  return v_res;
  441|   339k|}
blend_a64_mask_avx2.c:blend_a64_mask_sx_avx2:
  643|     36|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  644|     36|  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  645|     36|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     36|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     36|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  646|     36|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|     36|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  647|     36|  switch (w) {
  648|     18|    case 4:
  ------------------
  |  Branch (648:5): [True: 18, False: 18]
  ------------------
  649|    144|      do {
  650|    144|        const __m128i v_r_b = xx_loadl_64(mask);
  651|    144|        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
  652|    144|        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
  653|    144|        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
  654|    144|        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
  655|    144|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  656|       |
  657|    144|        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  658|       |
  659|    144|        xx_storel_32(dst, v_res_b);
  660|       |
  661|    144|        dst += dst_stride;
  662|    144|        src0 += src0_stride;
  663|    144|        src1 += src1_stride;
  664|    144|        mask += mask_stride;
  665|    144|      } while (--h);
  ------------------
  |  Branch (665:16): [True: 126, False: 18]
  ------------------
  666|     18|      break;
  667|      8|    case 8:
  ------------------
  |  Branch (667:5): [True: 8, False: 28]
  ------------------
  668|     96|      do {
  669|     96|        const __m128i v_r_b = xx_loadu_128(mask);
  670|     96|        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
  671|     96|        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
  672|     96|        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
  673|     96|        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
  674|     96|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  675|       |
  676|     96|        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  677|       |
  678|     96|        xx_storel_64(dst, v_res_b);
  679|       |
  680|     96|        dst += dst_stride;
  681|     96|        src0 += src0_stride;
  682|     96|        src1 += src1_stride;
  683|     96|        mask += mask_stride;
  684|     96|      } while (--h);
  ------------------
  |  Branch (684:16): [True: 88, False: 8]
  ------------------
  685|      8|      break;
  686|     10|    case 16:
  ------------------
  |  Branch (686:5): [True: 10, False: 26]
  ------------------
  687|     10|      blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
  688|     10|                                 src1_stride, mask, mask_stride, h);
  689|     10|      break;
  690|      0|    default:
  ------------------
  |  Branch (690:5): [True: 0, False: 36]
  ------------------
  691|      0|      blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
  692|      0|                                  src1_stride, mask, mask_stride, w, h);
  693|      0|      break;
  694|     36|  }
  695|     36|}
blend_a64_mask_avx2.c:blend_a64_mask_sx_w16_avx2:
  583|     10|    const uint8_t *mask, uint32_t mask_stride, int h) {
  584|     10|  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     10|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     10|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  585|     10|  const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
  586|    256|  do {
  587|    256|    const __m256i v_rl_b = yy_loadu_256(mask);
  588|    256|    const __m256i v_al_b =
  589|    256|        _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
  590|       |
  591|    256|    const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
  592|    256|    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
  593|    256|    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
  594|       |
  595|    256|    const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
  596|    256|                                             AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|    256|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  597|       |
  598|    256|    xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
  599|    256|    dst += dst_stride;
  600|    256|    src0 += src0_stride;
  601|    256|    src1 += src1_stride;
  602|    256|    mask += mask_stride;
  603|    256|  } while (--h);
  ------------------
  |  Branch (603:12): [True: 246, False: 10]
  ------------------
  604|     10|}
blend_a64_mask_avx2.c:blend_a64_mask_avx2:
  818|   366k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  819|   366k|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|   366k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|   366k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  820|   366k|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|   366k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  821|   366k|  switch (w) {
  822|  69.7k|    case 4:
  ------------------
  |  Branch (822:5): [True: 69.7k, False: 297k]
  ------------------
  823|   369k|      do {
  824|   369k|        const __m128i v_m0_b = xx_loadl_32(mask);
  825|   369k|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  826|   369k|        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  827|       |
  828|   369k|        xx_storel_32(dst, v_res_b);
  829|       |
  830|   369k|        dst += dst_stride;
  831|   369k|        src0 += src0_stride;
  832|   369k|        src1 += src1_stride;
  833|   369k|        mask += mask_stride;
  834|   369k|      } while (--h);
  ------------------
  |  Branch (834:16): [True: 299k, False: 69.7k]
  ------------------
  835|  69.7k|      break;
  836|   178k|    case 8:
  ------------------
  |  Branch (836:5): [True: 178k, False: 188k]
  ------------------
  837|  1.53M|      do {
  838|  1.53M|        const __m128i v_m0_b = xx_loadl_64(mask);
  839|  1.53M|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  840|  1.53M|        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  841|       |
  842|  1.53M|        xx_storel_64(dst, v_res_b);
  843|       |
  844|  1.53M|        dst += dst_stride;
  845|  1.53M|        src0 += src0_stride;
  846|  1.53M|        src1 += src1_stride;
  847|  1.53M|        mask += mask_stride;
  848|  1.53M|      } while (--h);
  ------------------
  |  Branch (848:16): [True: 1.35M, False: 178k]
  ------------------
  849|   178k|      break;
  850|   102k|    case 16:
  ------------------
  |  Branch (850:5): [True: 102k, False: 263k]
  ------------------
  851|  1.44M|      do {
  852|  1.44M|        const __m128i v_m0_b = xx_loadu_128(mask);
  853|  1.44M|        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
  854|  1.44M|        const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
  855|       |
  856|  1.44M|        xx_storeu_128(dst, v_res_b);
  857|  1.44M|        dst += dst_stride;
  858|  1.44M|        src0 += src0_stride;
  859|  1.44M|        src1 += src1_stride;
  860|  1.44M|        mask += mask_stride;
  861|  1.44M|      } while (--h);
  ------------------
  |  Branch (861:16): [True: 1.34M, False: 102k]
  ------------------
  862|   102k|      break;
  863|  15.5k|    default:
  ------------------
  |  Branch (863:5): [True: 15.5k, False: 351k]
  ------------------
  864|  15.5k|      blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
  865|  15.5k|                               src1_stride, mask, mask_stride, w, h);
  866|   366k|  }
  867|   366k|}
blend_a64_mask_avx2.c:blend_a64_mask_w32n_avx2:
  795|  15.5k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  796|  15.5k|  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  15.5k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  15.5k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  797|   339k|  do {
  798|   339k|    int c;
  799|   678k|    for (c = 0; c < w; c += 32) {
  ------------------
  |  Branch (799:17): [True: 339k, False: 339k]
  ------------------
  800|   339k|      const __m256i v_m0_b = yy_loadu_256(mask + c);
  801|   339k|      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
  802|       |
  803|   339k|      const __m256i v_res_b = blend_32_u8_avx2(
  804|   339k|          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|   339k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  805|       |
  806|   339k|      yy_storeu_256(dst + c, v_res_b);
  807|   339k|    }
  808|   339k|    dst += dst_stride;
  809|   339k|    src0 += src0_stride;
  810|   339k|    src1 += src1_stride;
  811|   339k|    mask += mask_stride;
  812|   339k|  } while (--h);
  ------------------
  |  Branch (812:12): [True: 323k, False: 15.5k]
  ------------------
  813|  15.5k|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_w4_avx2:
  911|   114k|    const __m256i *clip_high, const __m256i *mask_max) {
  912|       |  // Load 4x u16 pixels from each of 4 rows from each source
  913|   114k|  const __m256i s0 =
  914|   114k|      yy_loadu_4x64(src0 + 3 * src0_stride, src0 + 2 * src0_stride,
  915|   114k|                    src0 + 1 * src0_stride, src0 + 0 * src0_stride);
  916|   114k|  const __m256i s1 =
  917|   114k|      yy_loadu_4x64(src1 + 3 * src1_stride, src1 + 2 * src1_stride,
  918|   114k|                    src1 + 1 * src1_stride, src1 + 0 * src1_stride);
  919|       |  // Generate the inverse mask
  920|   114k|  const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
  921|       |
  922|       |  // Multiply each mask by the respective source
  923|   114k|  const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0);
  924|   114k|  const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0);
  925|   114k|  const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs);
  926|   114k|  const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs);
  927|       |  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
  928|       |  // lanes Later, packs does the same again which cancels this out with no need
  929|       |  // for a permute.  The intermediate values being reordered makes no difference
  930|       |
  931|   114k|  const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1);
  932|   114k|  const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1);
  933|   114k|  const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs);
  934|   114k|  const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs);
  935|       |
  936|   114k|  const __m256i sumh = _mm256_add_epi32(mul0h, mul1h);
  937|   114k|  const __m256i suml = _mm256_add_epi32(mul0l, mul1l);
  938|       |
  939|   114k|  const __m256i roundh =
  940|   114k|      _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift);
  941|   114k|  const __m256i roundl =
  942|   114k|      _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift);
  943|       |
  944|   114k|  const __m256i pack = _mm256_packs_epi32(roundl, roundh);
  945|   114k|  const __m256i clip =
  946|   114k|      _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high);
  947|       |
  948|       |  // _mm256_extract_epi64 doesn't exist on x86, so do it the old-fashioned way:
  949|   114k|  const __m128i cliph = _mm256_extracti128_si256(clip, 1);
  950|   114k|  xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8));
  951|   114k|  xx_storel_64(dst + 2 * dst_stride, cliph);
  952|   114k|  const __m128i clipl = _mm256_castsi256_si128(clip);
  953|   114k|  xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8));
  954|   114k|  xx_storel_64(dst + 0 * dst_stride, clipl);
  955|   114k|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2:
 1098|  36.6k|    const __m256i *mask_max) {
 1099|   115k|  do {
 1100|       |    // Load 8x u8 pixels from each of 4 rows in the mask
 1101|   115k|    const __m128i mask0a8 =
 1102|   115k|        _mm_set_epi64x(*(int64_t *)mask, *(uint64_t *)(mask + mask_stride));
 1103|   115k|    const __m128i mask0b8 =
 1104|   115k|        _mm_set_epi64x(*(int64_t *)(mask + 2 * mask_stride),
 1105|   115k|                       *(int64_t *)(mask + 3 * mask_stride));
 1106|   115k|    const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
 1107|   115k|    const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
 1108|       |
 1109|   115k|    highbd_blend_a64_d16_mask_w8_avx2(
 1110|   115k|        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
 1111|   115k|        round_offset, shift, clip_low, clip_high, mask_max);
 1112|       |
 1113|   115k|    dst += dst_stride * 4;
 1114|   115k|    src0 += src0_stride * 4;
 1115|   115k|    src1 += src1_stride * 4;
 1116|   115k|    mask += mask_stride * 4;
 1117|   115k|  } while (h -= 4);
  ------------------
  |  Branch (1117:12): [True: 79.2k, False: 36.6k]
  ------------------
 1118|  36.6k|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_w8_avx2:
 1025|   269k|    const __m256i *mask_max) {
 1026|       |  // Load 8x u16 pixels from each of 4 rows from each source
 1027|   269k|  const __m256i s0a =
 1028|   269k|      yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride);
 1029|   269k|  const __m256i s0b =
 1030|   269k|      yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
 1031|   269k|  const __m256i s1a =
 1032|   269k|      yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride);
 1033|   269k|  const __m256i s1b =
 1034|   269k|      yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
 1035|       |
 1036|       |  // Generate inverse masks
 1037|   269k|  const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
 1038|   269k|  const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
 1039|       |
 1040|       |  // Multiply sources by respective masks
 1041|   269k|  const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
 1042|   269k|  const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
 1043|   269k|  const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
 1044|   269k|  const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
 1045|       |  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
 1046|       |  // lanes Later, packs does the same again which cancels this out with no need
 1047|       |  // for a permute.  The intermediate values being reordered makes no difference
 1048|       |
 1049|   269k|  const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
 1050|   269k|  const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
 1051|   269k|  const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
 1052|   269k|  const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
 1053|       |
 1054|   269k|  const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah);
 1055|   269k|  const __m256i sumal = _mm256_add_epi32(mul0al, mul1al);
 1056|       |
 1057|   269k|  const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
 1058|   269k|  const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
 1059|   269k|  const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
 1060|   269k|  const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
 1061|       |
 1062|   269k|  const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
 1063|   269k|  const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
 1064|   269k|  const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
 1065|   269k|  const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
 1066|       |
 1067|   269k|  const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh);
 1068|   269k|  const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl);
 1069|       |
 1070|       |  // Divide down each result, with rounding
 1071|   269k|  const __m256i roundah =
 1072|   269k|      _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift);
 1073|   269k|  const __m256i roundal =
 1074|   269k|      _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift);
 1075|   269k|  const __m256i roundbh =
 1076|   269k|      _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift);
 1077|   269k|  const __m256i roundbl =
 1078|   269k|      _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift);
 1079|       |
 1080|       |  // Pack each i32 down to an i16 with saturation, then clip to valid range
 1081|   269k|  const __m256i packa = _mm256_packs_epi32(roundal, roundah);
 1082|   269k|  const __m256i clipa =
 1083|   269k|      _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
 1084|   269k|  const __m256i packb = _mm256_packs_epi32(roundbl, roundbh);
 1085|   269k|  const __m256i clipb =
 1086|   269k|      _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
 1087|       |
 1088|       |  // Store 8x u16 pixels to each of 4 rows in the destination
 1089|   269k|  yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa);
 1090|   269k|  yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb);
 1091|   269k|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2:
 1235|  59.6k|    const __m256i *mask_max) {
 1236|   803k|  for (int i = 0; i < h; i += 2) {
  ------------------
  |  Branch (1236:19): [True: 743k, False: 59.6k]
  ------------------
 1237|  2.71M|    for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (1237:21): [True: 1.97M, False: 743k]
  ------------------
 1238|       |      // Load 16x u8 alpha-mask values from each of two rows and pad to u16
 1239|  1.97M|      const __m128i masks_a8 = xx_loadu_128(mask + j);
 1240|  1.97M|      const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j);
 1241|  1.97M|      const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8);
 1242|  1.97M|      const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8);
 1243|       |
 1244|  1.97M|      highbd_blend_a64_d16_mask_w16_avx2(
 1245|  1.97M|          dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
 1246|  1.97M|          &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
 1247|  1.97M|    }
 1248|   743k|    dst += dst_stride * 2;
 1249|   743k|    src0 += src0_stride * 2;
 1250|   743k|    src1 += src1_stride * 2;
 1251|   743k|    mask += mask_stride * 2;
 1252|   743k|  }
 1253|  59.6k|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_w16_avx2:
 1165|  2.75M|    const __m256i *mask_max) {
 1166|       |  // Load 16x pixels from each of 2 rows from each source
 1167|  2.75M|  const __m256i s0a = yy_loadu_256(src0);
 1168|  2.75M|  const __m256i s0b = yy_loadu_256(src0 + src0_stride);
 1169|  2.75M|  const __m256i s1a = yy_loadu_256(src1);
 1170|  2.75M|  const __m256i s1b = yy_loadu_256(src1 + src1_stride);
 1171|       |
 1172|       |  // Calculate inverse masks
 1173|  2.75M|  const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
 1174|  2.75M|  const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
 1175|       |
 1176|       |  // Multiply each source by appropriate mask
 1177|  2.75M|  const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
 1178|  2.75M|  const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
 1179|  2.75M|  const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
 1180|  2.75M|  const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
 1181|       |  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
 1182|       |  // lanes Later, packs does the same again which cancels this out with no need
 1183|       |  // for a permute.  The intermediate values being reordered makes no difference
 1184|       |
 1185|  2.75M|  const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
 1186|  2.75M|  const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
 1187|  2.75M|  const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
 1188|  2.75M|  const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
 1189|       |
 1190|  2.75M|  const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah);
 1191|  2.75M|  const __m256i mulal = _mm256_add_epi32(mul0al, mul1al);
 1192|       |
 1193|  2.75M|  const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
 1194|  2.75M|  const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
 1195|  2.75M|  const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
 1196|  2.75M|  const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
 1197|       |
 1198|  2.75M|  const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
 1199|  2.75M|  const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
 1200|  2.75M|  const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
 1201|  2.75M|  const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
 1202|       |
 1203|  2.75M|  const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh);
 1204|  2.75M|  const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl);
 1205|       |
 1206|  2.75M|  const __m256i resah =
 1207|  2.75M|      _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift);
 1208|  2.75M|  const __m256i resal =
 1209|  2.75M|      _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift);
 1210|  2.75M|  const __m256i resbh =
 1211|  2.75M|      _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift);
 1212|  2.75M|  const __m256i resbl =
 1213|  2.75M|      _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift);
 1214|       |
 1215|       |  // Signed saturating pack from i32 to i16:
 1216|  2.75M|  const __m256i packa = _mm256_packs_epi32(resal, resah);
 1217|  2.75M|  const __m256i packb = _mm256_packs_epi32(resbl, resbh);
 1218|       |
 1219|       |  // Clip the values to the valid range
 1220|  2.75M|  const __m256i clipa =
 1221|  2.75M|      _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
 1222|  2.75M|  const __m256i clipb =
 1223|  2.75M|      _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
 1224|       |
 1225|       |  // Store 16 pixels
 1226|  2.75M|  yy_storeu_256(dst, clipa);
 1227|  2.75M|  yy_storeu_256(dst + dst_stride, clipb);
 1228|  2.75M|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2:
  987|  72.0k|    const __m256i *clip_high, const __m256i *mask_max) {
  988|  72.0k|  const __m256i one_b = _mm256_set1_epi8(1);
  989|  72.0k|  const __m256i two_w = _mm256_set1_epi16(2);
  990|   114k|  do {
  991|       |    // Load 8 pixels from each of 8 rows of mask,
  992|       |    // (saturating) add together rows then use madd to add adjacent pixels
  993|       |    // Finally, divide each value by 4 (with rounding)
  994|   114k|    const __m256i m0246 =
  995|   114k|        _mm256_set_epi64x(*(int64_t *)(mask + 6 * mask_stride),
  996|   114k|                          *(int64_t *)(mask + 4 * mask_stride),
  997|   114k|                          *(int64_t *)(mask + 2 * mask_stride),
  998|   114k|                          *(int64_t *)(mask + 0 * mask_stride));
  999|   114k|    const __m256i m1357 =
 1000|   114k|        _mm256_set_epi64x(*(int64_t *)(mask + 7 * mask_stride),
 1001|   114k|                          *(int64_t *)(mask + 5 * mask_stride),
 1002|   114k|                          *(int64_t *)(mask + 3 * mask_stride),
 1003|   114k|                          *(int64_t *)(mask + 1 * mask_stride));
 1004|   114k|    const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
 1005|   114k|    const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
 1006|   114k|    const __m256i mask0 =
 1007|   114k|        _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2);
 1008|       |
 1009|   114k|    highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
 1010|   114k|                                      src1_stride, &mask0, round_offset, shift,
 1011|   114k|                                      clip_low, clip_high, mask_max);
 1012|       |
 1013|   114k|    dst += dst_stride * 4;
 1014|   114k|    src0 += src0_stride * 4;
 1015|   114k|    src1 += src1_stride * 4;
 1016|   114k|    mask += mask_stride * 8;
 1017|   114k|  } while (h -= 4);
  ------------------
  |  Branch (1017:12): [True: 42.0k, False: 72.0k]
  ------------------
 1018|  72.0k|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2:
 1125|  63.4k|    const __m256i *mask_max) {
 1126|  63.4k|  const __m256i one_b = _mm256_set1_epi8(1);
 1127|  63.4k|  const __m256i two_w = _mm256_set1_epi16(2);
 1128|   153k|  do {
 1129|       |    // Load 16x u8 pixels from each of 8 rows in the mask,
 1130|       |    // (saturating) add together rows then use madd to add adjacent pixels
 1131|       |    // Finally, divide each value by 4 (with rounding)
 1132|   153k|    const __m256i m02 =
 1133|   153k|        yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride);
 1134|   153k|    const __m256i m13 =
 1135|   153k|        yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride);
 1136|   153k|    const __m256i m0123 =
 1137|   153k|        _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b);
 1138|   153k|    const __m256i mask_0a =
 1139|   153k|        _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2);
 1140|   153k|    const __m256i m46 =
 1141|   153k|        yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride);
 1142|   153k|    const __m256i m57 =
 1143|   153k|        yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride);
 1144|   153k|    const __m256i m4567 =
 1145|   153k|        _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b);
 1146|   153k|    const __m256i mask_0b =
 1147|   153k|        _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2);
 1148|       |
 1149|   153k|    highbd_blend_a64_d16_mask_w8_avx2(
 1150|   153k|        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
 1151|   153k|        &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
 1152|       |
 1153|   153k|    dst += dst_stride * 4;
 1154|   153k|    src0 += src0_stride * 4;
 1155|   153k|    src1 += src1_stride * 4;
 1156|   153k|    mask += mask_stride * 8;
 1157|   153k|  } while (h -= 4);
  ------------------
  |  Branch (1157:12): [True: 89.7k, False: 63.4k]
  ------------------
 1158|  63.4k|}
blend_a64_mask_avx2.c:highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2:
 1260|  53.0k|    const __m256i *mask_max) {
 1261|  53.0k|  const __m256i one_b = _mm256_set1_epi8(1);
 1262|  53.0k|  const __m256i two_w = _mm256_set1_epi16(2);
 1263|   468k|  for (int i = 0; i < h; i += 2) {
  ------------------
  |  Branch (1263:19): [True: 415k, False: 53.0k]
  ------------------
 1264|  1.21M|    for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (1264:21): [True: 794k, False: 415k]
  ------------------
 1265|       |      // Load 32x u8 alpha-mask values from each of four rows
 1266|       |      // (saturating) add pairs of rows, then use madd to add adjacent values
 1267|       |      // Finally, divide down each result with rounding
 1268|   794k|      const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j);
 1269|   794k|      const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j);
 1270|   794k|      const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j);
 1271|   794k|      const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j);
 1272|       |
 1273|   794k|      const __m256i m01_8 = _mm256_adds_epu8(m0, m1);
 1274|   794k|      const __m256i m23_8 = _mm256_adds_epu8(m2, m3);
 1275|       |
 1276|   794k|      const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b);
 1277|   794k|      const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b);
 1278|       |
 1279|   794k|      const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2);
 1280|   794k|      const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2);
 1281|       |
 1282|   794k|      highbd_blend_a64_d16_mask_w16_avx2(
 1283|   794k|          dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
 1284|   794k|          &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
 1285|   794k|    }
 1286|   415k|    dst += dst_stride * 2;
 1287|   415k|    src0 += src0_stride * 2;
 1288|   415k|    src1 += src1_stride * 2;
 1289|   415k|    mask += mask_stride * 4;
 1290|   415k|  }
 1291|  53.0k|}

aom_blend_a64_mask_sse4_1:
  389|   995k|                               int h, int subw, int subh) {
  390|   995k|  typedef void (*blend_fn)(
  391|   995k|      uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
  392|   995k|      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
  393|   995k|      const uint8_t *mask, uint32_t mask_stride, int w, int h);
  394|       |
  395|       |  // Dimensions are: width_index X subx X suby
  396|   995k|  static const blend_fn blend[3][2][2] = {
  397|   995k|    { // w % 16 == 0
  398|   995k|      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
  399|   995k|      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
  400|   995k|    { // w == 4
  401|   995k|      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
  402|   995k|      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
  403|   995k|    { // w == 8
  404|   995k|      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
  405|   995k|      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
  406|   995k|  };
  407|       |
  408|   995k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  409|   995k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  410|       |
  411|   995k|  assert(h >= 1);
  412|   995k|  assert(w >= 1);
  413|   995k|  assert(IS_POWER_OF_TWO(h));
  414|   995k|  assert(IS_POWER_OF_TWO(w));
  415|       |
  416|   995k|  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
  ------------------
  |  |   55|   995k|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 311k, False: 684k]
  |  |  ------------------
  ------------------
  417|   311k|    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
  418|   311k|                         mask, mask_stride, w, h, subw, subh);
  419|   684k|  } else {
  420|   684k|    blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
  421|   684k|                                              src0_stride, src1, src1_stride,
  422|   684k|                                              mask, mask_stride, w, h);
  423|   684k|  }
  424|   995k|}
aom_highbd_blend_a64_mask_sse4_1:
  822|  1.39M|                                      int subw, int subh, int bd) {
  823|  1.39M|  typedef void (*blend_fn)(
  824|  1.39M|      uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
  825|  1.39M|      uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
  826|  1.39M|      const uint8_t *mask, uint32_t mask_stride, int w, int h);
  827|       |
  828|       |  // Dimensions are: bd_index X width_index X subw X subh
  829|  1.39M|  static const blend_fn blend[2][2][2][2] = {
  830|  1.39M|    {   // bd == 8 or 10
  831|  1.39M|      { // w % 8 == 0
  832|  1.39M|        { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
  833|  1.39M|        { blend_a64_mask_b10_sx_w8n_sse4_1,
  834|  1.39M|          blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
  835|  1.39M|      { // w == 4
  836|  1.39M|        { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
  837|  1.39M|        { blend_a64_mask_b10_sx_w4_sse4_1,
  838|  1.39M|          blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
  839|  1.39M|    {   // bd == 12
  840|  1.39M|      { // w % 8 == 0
  841|  1.39M|        { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
  842|  1.39M|        { blend_a64_mask_b12_sx_w8n_sse4_1,
  843|  1.39M|          blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
  844|  1.39M|      { // w == 4
  845|  1.39M|        { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
  846|  1.39M|        { blend_a64_mask_b12_sx_w4_sse4_1,
  847|  1.39M|          blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
  848|  1.39M|  };
  849|       |
  850|  1.39M|  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  851|  1.39M|  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
  852|       |
  853|  1.39M|  assert(h >= 1);
  854|  1.39M|  assert(w >= 1);
  855|  1.39M|  assert(IS_POWER_OF_TWO(h));
  856|  1.39M|  assert(IS_POWER_OF_TWO(w));
  857|       |
  858|  1.39M|  assert(bd == 8 || bd == 10 || bd == 12);
  859|  1.39M|  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
  ------------------
  |  |   55|  1.39M|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 213k, False: 1.17M]
  |  |  ------------------
  ------------------
  860|   213k|    aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
  861|   213k|                                src1_stride, mask, mask_stride, w, h, subw,
  862|   213k|                                subh, bd);
  863|  1.17M|  } else {
  864|  1.17M|    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|  1.17M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  865|  1.17M|    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
  ------------------
  |  |   75|  1.17M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  866|  1.17M|    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
  ------------------
  |  |   75|  1.17M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  867|       |
  868|  1.17M|    blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
  869|  1.17M|        dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
  870|  1.17M|        mask_stride, w, h);
  871|  1.17M|  }
  872|  1.39M|}
blend_a64_mask_sse4.c:blend_a64_mask_w16n_sse4_1:
   76|  54.9k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   77|  54.9k|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  54.9k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  54.9k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   78|  54.9k|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|  54.9k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   79|       |
   80|   895k|  do {
   81|   895k|    int c;
   82|  1.94M|    for (c = 0; c < w; c += 16) {
  ------------------
  |  Branch (82:17): [True: 1.05M, False: 895k]
  ------------------
   83|  1.05M|      const __m128i v_m0_b = xx_loadu_128(mask + c);
   84|  1.05M|      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
   85|       |
   86|  1.05M|      const __m128i v_res_b =
   87|  1.05M|          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
   88|       |
   89|  1.05M|      xx_storeu_128(dst + c, v_res_b);
   90|  1.05M|    }
   91|   895k|    dst += dst_stride;
   92|   895k|    src0 += src0_stride;
   93|   895k|    src1 += src1_stride;
   94|   895k|    mask += mask_stride;
   95|   895k|  } while (--h);
  ------------------
  |  Branch (95:12): [True: 840k, False: 54.9k]
  ------------------
   96|  54.9k|}
blend_a64_mask_sse4.c:blend_a64_mask_w4_sse4_1:
   35|   417k|                                     int w, int h) {
   36|   417k|  (void)w;
   37|   417k|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|   417k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|   417k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   38|   417k|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|   417k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   39|  3.41M|  do {
   40|  3.41M|    const __m128i v_m0_b = xx_loadl_32(mask);
   41|  3.41M|    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
   42|  3.41M|    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
   43|  3.41M|    xx_storel_32(dst, v_res_b);
   44|       |
   45|  3.41M|    dst += dst_stride;
   46|  3.41M|    src0 += src0_stride;
   47|  3.41M|    src1 += src1_stride;
   48|  3.41M|    mask += mask_stride;
   49|  3.41M|  } while (--h);
  ------------------
  |  Branch (49:12): [True: 2.99M, False: 417k]
  ------------------
   50|   417k|}
blend_a64_mask_sse4.c:blend_a64_mask_w8_sse4_1:
   56|   211k|                                     int w, int h) {
   57|   211k|  (void)w;
   58|   211k|  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|   211k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|   211k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   59|   211k|  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  ------------------
  |  |   23|   211k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   60|  2.22M|  do {
   61|  2.22M|    const __m128i v_m0_b = xx_loadl_64(mask);
   62|  2.22M|    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
   63|  2.22M|    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
   64|  2.22M|    xx_storel_64(dst, v_res_b);
   65|       |
   66|  2.22M|    dst += dst_stride;
   67|  2.22M|    src0 += src0_stride;
   68|  2.22M|    src1 += src1_stride;
   69|  2.22M|    mask += mask_stride;
   70|  2.22M|  } while (--h);
  ------------------
  |  Branch (70:12): [True: 2.01M, False: 211k]
  ------------------
   71|   211k|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_w8n_sse4_1:
  499|   572k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  500|   572k|  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  501|   572k|                               src1_stride, mask, mask_stride, w, h,
  502|   572k|                               blend_8_b10);
  503|   572k|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_w8n_sse4_1:
  475|   606k|    blend_unit_fn blend) {
  476|   606k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|   606k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|   606k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  477|       |
  478|  6.67M|  do {
  479|  6.67M|    int c;
  480|  16.9M|    for (c = 0; c < w; c += 8) {
  ------------------
  |  Branch (480:17): [True: 10.2M, False: 6.67M]
  ------------------
  481|  10.2M|      const __m128i v_m0_b = xx_loadl_64(mask + c);
  482|  10.2M|      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
  483|  10.2M|      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  484|       |
  485|  10.2M|      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
  486|       |
  487|  10.2M|      xx_storeu_128(dst + c, v_res_w);
  488|  10.2M|    }
  489|  6.67M|    dst += dst_stride;
  490|  6.67M|    src0 += src0_stride;
  491|  6.67M|    src1 += src1_stride;
  492|  6.67M|    mask += mask_stride;
  493|  6.67M|  } while (--h);
  ------------------
  |  Branch (493:12): [True: 6.06M, False: 606k]
  ------------------
  494|   606k|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_sx_w8n_sse4_1:
  596|     14|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  597|     14|  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  598|     14|                                  src1_stride, mask, mask_stride, w, h,
  599|     14|                                  blend_8_b10);
  600|     14|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_sx_w8n_sse4_1:
  568|     38|    blend_unit_fn blend) {
  569|     38|  const __m128i v_zmask_b =
  570|     38|      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  571|     38|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     38|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     38|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  572|       |
  573|    688|  do {
  574|    688|    int c;
  575|  1.82k|    for (c = 0; c < w; c += 8) {
  ------------------
  |  Branch (575:17): [True: 1.13k, False: 688]
  ------------------
  576|  1.13k|      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
  577|  1.13k|      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
  578|       |
  579|  1.13k|      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
  580|  1.13k|      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  581|       |
  582|  1.13k|      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
  583|       |
  584|  1.13k|      xx_storeu_128(dst + c, v_res_w);
  585|  1.13k|    }
  586|    688|    dst += dst_stride;
  587|    688|    src0 += src0_stride;
  588|    688|    src1 += src1_stride;
  589|    688|    mask += mask_stride;
  590|    688|  } while (--h);
  ------------------
  |  Branch (590:12): [True: 650, False: 38]
  ------------------
  591|     38|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_sx_sy_w8n_sse4_1:
  798|  50.8k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  799|  50.8k|  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  800|  50.8k|                                     src1_stride, mask, mask_stride, w, h,
  801|  50.8k|                                     blend_8_b10);
  802|  50.8k|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_sx_sy_w8n_sse4_1:
  765|  51.8k|    blend_unit_fn blend) {
  766|  51.8k|  const __m128i v_zmask_b =
  767|  51.8k|      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  768|  51.8k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  51.8k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  51.8k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  769|       |
  770|   360k|  do {
  771|   360k|    int c;
  772|   764k|    for (c = 0; c < w; c += 8) {
  ------------------
  |  Branch (772:17): [True: 403k, False: 360k]
  ------------------
  773|   403k|      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
  774|   403k|      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
  775|   403k|      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
  776|   403k|      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
  777|   403k|      const __m128i v_rvsb_w =
  778|   403k|          _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
  779|   403k|      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
  780|       |
  781|   403k|      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
  782|   403k|      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  783|       |
  784|   403k|      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
  785|       |
  786|   403k|      xx_storeu_128(dst + c, v_res_w);
  787|   403k|    }
  788|   360k|    dst += dst_stride;
  789|   360k|    src0 += src0_stride;
  790|   360k|    src1 += src1_stride;
  791|   360k|    mask += 2 * mask_stride;
  792|   360k|  } while (--h);
  ------------------
  |  Branch (792:12): [True: 308k, False: 51.8k]
  ------------------
  793|  51.8k|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_w4_sse4_1:
  456|   442k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  457|   442k|  (void)w;
  458|   442k|  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  459|   442k|                              src1_stride, mask, mask_stride, h, blend_4_b10);
  460|   442k|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_w4_sse4_1:
  434|   458k|    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  435|   458k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|   458k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|   458k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  436|       |
  437|  3.07M|  do {
  438|  3.07M|    const __m128i v_m0_b = xx_loadl_32(mask);
  439|  3.07M|    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
  440|  3.07M|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  441|       |
  442|  3.07M|    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
  443|       |
  444|  3.07M|    xx_storel_64(dst, v_res_w);
  445|       |
  446|  3.07M|    dst += dst_stride;
  447|  3.07M|    src0 += src0_stride;
  448|  3.07M|    src1 += src1_stride;
  449|  3.07M|    mask += mask_stride;
  450|  3.07M|  } while (--h);
  ------------------
  |  Branch (450:12): [True: 2.61M, False: 458k]
  ------------------
  451|   458k|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_sx_w4_sse4_1:
  547|     12|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  548|     12|  (void)w;
  549|     12|  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  550|     12|                                 src1_stride, mask, mask_stride, h,
  551|     12|                                 blend_4_b10);
  552|     12|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_sx_w4_sse4_1:
  521|     38|    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  522|     38|  const __m128i v_zmask_b =
  523|     38|      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  524|     38|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     38|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     38|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  525|       |
  526|    304|  do {
  527|    304|    const __m128i v_r_b = xx_loadl_64(mask);
  528|    304|    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
  529|       |
  530|    304|    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
  531|    304|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  532|       |
  533|    304|    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
  534|       |
  535|    304|    xx_storel_64(dst, v_res_w);
  536|       |
  537|    304|    dst += dst_stride;
  538|    304|    src0 += src0_stride;
  539|    304|    src1 += src1_stride;
  540|    304|    mask += mask_stride;
  541|    304|  } while (--h);
  ------------------
  |  Branch (541:12): [True: 266, False: 38]
  ------------------
  542|     38|}
blend_a64_mask_sse4.c:blend_a64_mask_b10_sx_sy_w4_sse4_1:
  744|  58.4k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  745|  58.4k|  (void)w;
  746|  58.4k|  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  747|  58.4k|                                    src1_stride, mask, mask_stride, h,
  748|  58.4k|                                    blend_4_b10);
  749|  58.4k|}
blend_a64_mask_sse4.c:blend_a64_mask_bn_sx_sy_w4_sse4_1:
  713|  61.6k|    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  714|  61.6k|  const __m128i v_zmask_b =
  715|  61.6k|      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  716|  61.6k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  61.6k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  61.6k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  717|       |
  718|   280k|  do {
  719|   280k|    const __m128i v_ra_b = xx_loadl_64(mask);
  720|   280k|    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
  721|   280k|    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
  722|   280k|    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
  723|   280k|    const __m128i v_rvsb_w =
  724|   280k|        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
  725|   280k|    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
  726|       |
  727|   280k|    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
  728|   280k|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  729|       |
  730|   280k|    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
  731|       |
  732|   280k|    xx_storel_64(dst, v_res_w);
  733|       |
  734|   280k|    dst += dst_stride;
  735|   280k|    src0 += src0_stride;
  736|   280k|    src1 += src1_stride;
  737|   280k|    mask += 2 * mask_stride;
  738|   280k|  } while (--h);
  ------------------
  |  Branch (738:12): [True: 219k, False: 61.6k]
  ------------------
  739|  61.6k|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_w8n_sse4_1:
  508|  33.5k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  509|  33.5k|  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  510|  33.5k|                               src1_stride, mask, mask_stride, w, h,
  511|  33.5k|                               blend_8_b12);
  512|  33.5k|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_sx_w8n_sse4_1:
  605|     24|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  606|     24|  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  607|     24|                                  src1_stride, mask, mask_stride, w, h,
  608|     24|                                  blend_8_b12);
  609|     24|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_sx_sy_w8n_sse4_1:
  807|    968|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  808|    968|  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  809|    968|                                     src1_stride, mask, mask_stride, w, h,
  810|    968|                                     blend_8_b12);
  811|    968|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_w4_sse4_1:
  465|  16.2k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  466|  16.2k|  (void)w;
  467|  16.2k|  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  468|  16.2k|                              src1_stride, mask, mask_stride, h, blend_4_b12);
  469|  16.2k|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_sx_w4_sse4_1:
  557|     26|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  558|     26|  (void)w;
  559|     26|  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  560|     26|                                 src1_stride, mask, mask_stride, h,
  561|     26|                                 blend_4_b12);
  562|     26|}
blend_a64_mask_sse4.c:blend_a64_mask_b12_sx_sy_w4_sse4_1:
  754|  3.18k|    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  755|  3.18k|  (void)w;
  756|  3.18k|  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  757|  3.18k|                                    src1_stride, mask, mask_stride, h,
  758|  3.18k|                                    blend_4_b12);
  759|  3.18k|}

aom_blend_a64_vmask_sse4_1:
  115|   554k|                                const uint8_t *mask, int w, int h) {
  116|   554k|  typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
  117|   554k|                           const uint8_t *src0, uint32_t src0_stride,
  118|   554k|                           const uint8_t *src1, uint32_t src1_stride,
  119|   554k|                           const uint8_t *mask, int w, int h);
  120|       |
  121|       |  // Dimension: width_index
  122|   554k|  static const blend_fn blend[9] = {
  123|   554k|    blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
  124|   554k|    aom_blend_a64_vmask_c,        // w == 1
  125|   554k|    aom_blend_a64_vmask_c,        // w == 2
  126|   554k|    NULL,                         // INVALID
  127|   554k|    blend_a64_vmask_w4_sse4_1,    // w == 4
  128|   554k|    NULL,                         // INVALID
  129|   554k|    NULL,                         // INVALID
  130|   554k|    NULL,                         // INVALID
  131|   554k|    blend_a64_vmask_w8_sse4_1,    // w == 8
  132|   554k|  };
  133|       |
  134|   554k|  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  135|   554k|  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
  136|       |
  137|   554k|  assert(h >= 1);
  138|   554k|  assert(w >= 1);
  139|   554k|  assert(IS_POWER_OF_TWO(h));
  140|   554k|  assert(IS_POWER_OF_TWO(w));
  141|       |
  142|   554k|  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
  143|   554k|                 h);
  144|   554k|}
aom_highbd_blend_a64_vmask_sse4_1:
  243|   467k|    const uint8_t *mask, int w, int h, int bd) {
  244|   467k|  typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
  245|   467k|                           const uint16_t *src0, uint32_t src0_stride,
  246|   467k|                           const uint16_t *src1, uint32_t src1_stride,
  247|   467k|                           const uint8_t *mask, int w, int h);
  248|       |
  249|       |  // Dimensions are: bd_index X width_index
  250|   467k|  static const blend_fn blend[2][2] = {
  251|   467k|    {
  252|       |        // bd == 8 or 10
  253|   467k|        blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
  254|   467k|        blend_a64_vmask_b10_w4_sse4_1,   // w == 4
  255|   467k|    },
  256|   467k|    {
  257|       |        // bd == 12
  258|   467k|        blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
  259|   467k|        blend_a64_vmask_b12_w4_sse4_1,   // w == 4
  260|   467k|    }
  261|   467k|  };
  262|       |
  263|   467k|  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  264|   467k|  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
  265|       |
  266|   467k|  assert(h >= 1);
  267|   467k|  assert(w >= 1);
  268|   467k|  assert(IS_POWER_OF_TWO(h));
  269|   467k|  assert(IS_POWER_OF_TWO(w));
  270|       |
  271|   467k|  assert(bd == 8 || bd == 10 || bd == 12);
  272|       |
  273|   467k|  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
  ------------------
  |  |   55|   467k|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 25.7k, False: 441k]
  |  |  ------------------
  ------------------
  274|  25.7k|    aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
  275|  25.7k|                                 src1_stride, mask, w, h, bd);
  276|   441k|  } else {
  277|   441k|    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
  ------------------
  |  |   75|   441k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  278|   441k|    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
  ------------------
  |  |   75|   441k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  279|   441k|    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
  ------------------
  |  |   75|   441k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  280|       |
  281|   441k|    blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
  282|   441k|                                  src1_stride, mask, w, h);
  283|   441k|  }
  284|   467k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_w16n_sse4_1:
   85|   198k|                                        const uint8_t *mask, int w, int h) {
   86|   198k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|   198k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|   198k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   87|       |
   88|  1.32M|  do {
   89|  1.32M|    int c;
   90|  1.32M|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
   91|  1.32M|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
   92|  3.06M|    for (c = 0; c < w; c += 16) {
  ------------------
  |  Branch (92:17): [True: 1.74M, False: 1.32M]
  ------------------
   93|  1.74M|      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
   94|  1.74M|      const __m128i v_resh_w =
   95|  1.74M|          blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
   96|       |
   97|  1.74M|      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
   98|       |
   99|  1.74M|      xx_storeu_128(dst + c, v_res_b);
  100|  1.74M|    }
  101|  1.32M|    dst += dst_stride;
  102|  1.32M|    src0 += src0_stride;
  103|  1.32M|    src1 += src1_stride;
  104|  1.32M|    mask += 1;
  105|  1.32M|  } while (--h);
  ------------------
  |  Branch (105:12): [True: 1.12M, False: 198k]
  ------------------
  106|   198k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_w4_sse4_1:
   33|  52.6k|                                      const uint8_t *mask, int w, int h) {
   34|  52.6k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  52.6k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  52.6k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   35|       |
   36|  52.6k|  (void)w;
   37|       |
   38|   324k|  do {
   39|   324k|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
   40|   324k|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
   41|       |
   42|   324k|    const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
   43|       |
   44|   324k|    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
   45|       |
   46|   324k|    xx_storel_32(dst, v_res_b);
   47|       |
   48|   324k|    dst += dst_stride;
   49|   324k|    src0 += src0_stride;
   50|   324k|    src1 += src1_stride;
   51|   324k|    mask += 1;
   52|   324k|  } while (--h);
  ------------------
  |  Branch (52:12): [True: 271k, False: 52.6k]
  ------------------
   53|  52.6k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_w8_sse4_1:
   58|   303k|                                      const uint8_t *mask, int w, int h) {
   59|   303k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|   303k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|   303k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   60|       |
   61|   303k|  (void)w;
   62|       |
   63|  1.83M|  do {
   64|  1.83M|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
   65|  1.83M|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
   66|       |
   67|  1.83M|    const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
   68|       |
   69|  1.83M|    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
   70|       |
   71|  1.83M|    xx_storel_64(dst, v_res_b);
   72|       |
   73|  1.83M|    dst += dst_stride;
   74|  1.83M|    src0 += src0_stride;
   75|  1.83M|    src1 += src1_stride;
   76|  1.83M|    mask += 1;
   77|  1.83M|  } while (--h);
  ------------------
  |  Branch (77:12): [True: 1.52M, False: 303k]
  ------------------
   78|   303k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_b10_w8n_sse4_1:
  221|   382k|                                           const uint8_t *mask, int w, int h) {
  222|   382k|  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  223|   382k|                                src1_stride, mask, w, h, blend_8_b10);
  224|   382k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_bn_w8n_sse4_1:
  197|   403k|    const uint8_t *mask, int w, int h, blend_unit_fn blend) {
  198|   403k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|   403k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|   403k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  199|       |
  200|  2.48M|  do {
  201|  2.48M|    int c;
  202|  2.48M|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
  203|  2.48M|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  204|  6.82M|    for (c = 0; c < w; c += 8) {
  ------------------
  |  Branch (204:17): [True: 4.33M, False: 2.48M]
  ------------------
  205|  4.33M|      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
  206|       |
  207|  4.33M|      xx_storeu_128(dst + c, v_res_w);
  208|  4.33M|    }
  209|  2.48M|    dst += dst_stride;
  210|  2.48M|    src0 += src0_stride;
  211|  2.48M|    src1 += src1_stride;
  212|  2.48M|    mask += 1;
  213|  2.48M|  } while (--h);
  ------------------
  |  Branch (213:12): [True: 2.08M, False: 403k]
  ------------------
  214|   403k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_b10_w4_sse4_1:
  177|  36.7k|                                          const uint8_t *mask, int w, int h) {
  178|  36.7k|  (void)w;
  179|  36.7k|  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  180|  36.7k|                               src1_stride, mask, h, blend_4_b10);
  181|  36.7k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_bn_w4_sse4_1:
  154|  38.1k|    const uint8_t *mask, int h, blend_unit_fn blend) {
  155|  38.1k|  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  38.1k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  38.1k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  156|       |
  157|   230k|  do {
  158|   230k|    const __m128i v_m0_w = _mm_set1_epi16(*mask);
  159|   230k|    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
  160|       |
  161|   230k|    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
  162|       |
  163|   230k|    xx_storel_64(dst, v_res_w);
  164|       |
  165|   230k|    dst += dst_stride;
  166|   230k|    src0 += src0_stride;
  167|   230k|    src1 += src1_stride;
  168|   230k|    mask += 1;
  169|   230k|  } while (--h);
  ------------------
  |  Branch (169:12): [True: 191k, False: 38.1k]
  ------------------
  170|  38.1k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_b12_w8n_sse4_1:
  231|  21.2k|                                           const uint8_t *mask, int w, int h) {
  232|  21.2k|  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  233|  21.2k|                                src1_stride, mask, w, h, blend_8_b12);
  234|  21.2k|}
blend_a64_vmask_sse4.c:blend_a64_vmask_b12_w4_sse4_1:
  188|  1.38k|                                          const uint8_t *mask, int w, int h) {
  189|  1.38k|  (void)w;
  190|  1.38k|  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
  191|  1.38k|                               src1_stride, mask, h, blend_4_b12);
  192|  1.38k|}

blend_a64_mask_avx2.c:blend_a64_d16_mask_w4_sse41:
   30|   193k|    int shift) {
   31|   193k|  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
   32|   193k|  const __m128i s0 = xx_loadl_64(src0);
   33|   193k|  const __m128i s1 = xx_loadl_64(src1);
   34|   193k|  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
   35|   193k|  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
   36|   193k|  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
   37|   193k|  const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
   38|   193k|  const __m128i res_d = _mm_srai_epi32(res_c, shift);
   39|   193k|  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
   40|   193k|  const __m128i res = _mm_packus_epi16(res_e, res_e);
   41|       |
   42|   193k|  xx_storel_32(dst, res);
   43|   193k|}
blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1:
   87|  16.4k|    const __m128i *round_offset, int shift) {
   88|  16.4k|  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  16.4k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  16.4k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
   89|   218k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (89:19): [True: 201k, False: 16.4k]
  ------------------
   90|   201k|    const __m128i m0 = xx_loadl_64(mask);
   91|   201k|    const __m128i m = _mm_cvtepu8_epi16(m0);
   92|   201k|    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
   93|   201k|                                shift);
   94|   201k|    mask += mask_stride;
   95|   201k|    dst += dst_stride;
   96|   201k|    src0 += src0_stride;
   97|   201k|    src1 += src1_stride;
   98|   201k|  }
   99|  16.4k|}
blend_a64_mask_avx2.c:blend_a64_d16_mask_w8_sse41:
   48|   615k|    int shift) {
   49|   615k|  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
   50|   615k|  const __m128i s0 = xx_loadu_128(src0);
   51|   615k|  const __m128i s1 = xx_loadu_128(src1);
   52|   615k|  __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
   53|   615k|                                  _mm_unpacklo_epi16(*m, max_minus_m));
   54|   615k|  __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
   55|   615k|                                  _mm_unpackhi_epi16(*m, max_minus_m));
   56|   615k|  res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
   57|   615k|  res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
   58|   615k|  const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
   59|   615k|  const __m128i res = _mm_packus_epi16(res_e, res_e);
   60|       |
   61|   615k|  _mm_storel_epi64((__m128i *)(dst), res);
   62|   615k|}
blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1:
  105|  31.5k|    const __m128i *round_offset, int shift) {
  106|  31.5k|  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  31.5k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  31.5k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  107|  31.5k|  const __m128i one_b = _mm_set1_epi8(1);
  108|  31.5k|  const __m128i two_w = _mm_set1_epi16(2);
  109|   224k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (109:19): [True: 193k, False: 31.5k]
  ------------------
  110|   193k|    const __m128i m_i0 = xx_loadl_64(mask);
  111|   193k|    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
  112|   193k|    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
  113|   193k|    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
  114|   193k|    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
  115|   193k|    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
  116|       |
  117|   193k|    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
  118|   193k|                                shift);
  119|   193k|    mask += mask_stride << 1;
  120|   193k|    dst += dst_stride;
  121|   193k|    src0 += src0_stride;
  122|   193k|    src1 += src1_stride;
  123|   193k|  }
  124|  31.5k|}
blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1:
  130|  30.9k|    const __m128i *round_offset, int shift) {
  131|  30.9k|  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  30.9k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  30.9k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  132|  30.9k|  const __m128i one_b = _mm_set1_epi8(1);
  133|  30.9k|  const __m128i two_w = _mm_set1_epi16(2);
  134|   443k|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (134:19): [True: 412k, False: 30.9k]
  ------------------
  135|   412k|    const __m128i m_i0 = xx_loadu_128(mask);
  136|   412k|    const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
  137|   412k|    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
  138|   412k|    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
  139|   412k|    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
  140|   412k|    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
  141|       |
  142|   412k|    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
  143|   412k|                                shift);
  144|   412k|    mask += mask_stride << 1;
  145|   412k|    dst += dst_stride;
  146|   412k|    src0 += src0_stride;
  147|   412k|    src1 += src1_stride;
  148|   412k|  }
  149|  30.9k|}
blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1:
  155|     36|    const __m128i *round_offset, int shift) {
  156|     36|  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     36|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     36|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  157|     36|  const __m128i one_b = _mm_set1_epi8(1);
  158|     36|  const __m128i zeros = _mm_setzero_si128();
  159|    324|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (159:19): [True: 288, False: 36]
  ------------------
  160|    288|    const __m128i m_i0 = xx_loadl_64(mask);
  161|    288|    const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
  162|    288|    const __m128i m = _mm_avg_epu16(m_ac, zeros);
  163|       |
  164|    288|    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
  165|    288|                                shift);
  166|    288|    mask += mask_stride;
  167|    288|    dst += dst_stride;
  168|    288|    src0 += src0_stride;
  169|    288|    src1 += src1_stride;
  170|    288|  }
  171|     36|}
blend_a64_mask_avx2.c:aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1:
  177|     50|    const __m128i *round_offset, int shift) {
  178|     50|  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|     50|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|     50|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  179|     50|  const __m128i one_b = _mm_set1_epi8(1);
  180|     50|  const __m128i zeros = _mm_setzero_si128();
  181|    690|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (181:19): [True: 640, False: 50]
  ------------------
  182|    640|    const __m128i m_i0 = xx_loadu_128(mask);
  183|    640|    const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
  184|    640|    const __m128i m = _mm_avg_epu16(m_ac, zeros);
  185|       |
  186|    640|    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
  187|    640|                                shift);
  188|    640|    mask += mask_stride;
  189|    640|    dst += dst_stride;
  190|    640|    src0 += src0_stride;
  191|    640|    src1 += src1_stride;
  192|    640|  }
  193|     50|}

blend_a64_mask_sse4.c:blend_16_u8:
   88|  1.05M|                                  const __m128i *rounding) {
   89|  1.05M|  const __m128i v_s0_b = xx_loadu_128(src0);
   90|  1.05M|  const __m128i v_s1_b = xx_loadu_128(src1);
   91|       |
   92|  1.05M|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   93|  1.05M|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   94|  1.05M|  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
   95|  1.05M|                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
   96|       |
   97|  1.05M|  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   98|  1.05M|  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
   99|  1.05M|  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
  100|  1.05M|  return v_res;
  101|  1.05M|}
blend_a64_mask_sse4.c:blend_4_u8:
   60|  3.41M|                                 const __m128i *rounding) {
   61|  3.41M|  const __m128i v_s0_b = xx_loadl_32(src0);
   62|  3.41M|  const __m128i v_s1_b = xx_loadl_32(src1);
   63|       |
   64|  3.41M|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   65|  3.41M|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   66|       |
   67|  3.41M|  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   68|  3.41M|  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
   69|  3.41M|  return v_res;
   70|  3.41M|}
blend_a64_mask_sse4.c:blend_8_u8:
   74|  2.22M|                                 const __m128i *rounding) {
   75|  2.22M|  const __m128i v_s0_b = xx_loadl_64(src0);
   76|  2.22M|  const __m128i v_s1_b = xx_loadl_64(src1);
   77|       |
   78|  2.22M|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   79|  2.22M|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   80|       |
   81|  2.22M|  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   82|  2.22M|  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
   83|  2.22M|  return v_res;
   84|  2.22M|}
blend_a64_mask_sse4.c:blend_8_b10:
  122|  10.0M|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  123|  10.0M|  const __m128i v_s0_w = xx_loadu_128(src0);
  124|  10.0M|  const __m128i v_s1_w = xx_loadu_128(src1);
  125|       |
  126|  10.0M|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  127|  10.0M|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
  128|       |
  129|  10.0M|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
  130|       |
  131|  10.0M|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  10.0M|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  132|       |
  133|  10.0M|  return v_res_w;
  134|  10.0M|}
blend_a64_mask_sse4.c:blend_4_b10:
  107|  3.20M|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  108|  3.20M|  const __m128i v_s0_w = xx_loadl_64(src0);
  109|  3.20M|  const __m128i v_s1_w = xx_loadl_64(src1);
  110|       |
  111|  3.20M|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  112|  3.20M|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
  113|       |
  114|  3.20M|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
  115|       |
  116|  3.20M|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  3.20M|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  117|       |
  118|  3.20M|  return v_res_w;
  119|  3.20M|}
blend_a64_mask_sse4.c:blend_8_b12:
  162|   659k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  163|   659k|  const __m128i v_s0_w = xx_loadu_128(src0);
  164|   659k|  const __m128i v_s1_w = xx_loadu_128(src1);
  165|       |
  166|       |  // Interleave
  167|   659k|  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  168|   659k|  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
  169|   659k|  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  170|   659k|  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
  171|       |
  172|       |  // Multiply-Add
  173|   659k|  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
  174|   659k|  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
  175|       |
  176|       |  // Scale
  177|   659k|  const __m128i v_ssuml_d =
  178|   659k|      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|   659k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  179|   659k|  const __m128i v_ssumh_d =
  180|   659k|      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|   659k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  181|       |
  182|       |  // Pack
  183|   659k|  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
  184|       |
  185|       |  // Round
  186|   659k|  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
  187|       |
  188|   659k|  return v_res_w;
  189|   659k|}
blend_a64_mask_sse4.c:blend_4_b12:
  137|   151k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  138|   151k|  const __m128i v_s0_w = xx_loadl_64(src0);
  139|   151k|  const __m128i v_s1_w = xx_loadl_64(src1);
  140|       |
  141|       |  // Interleave
  142|   151k|  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  143|   151k|  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  144|       |
  145|       |  // Multiply-Add
  146|   151k|  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
  147|       |
  148|       |  // Scale
  149|   151k|  const __m128i v_ssum_d =
  150|   151k|      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|   151k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  151|       |
  152|       |  // Pack
  153|   151k|  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
  154|       |
  155|       |  // Round
  156|   151k|  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
  157|       |
  158|   151k|  return v_res_w;
  159|   151k|}
blend_a64_vmask_sse4.c:blend_8:
   42|  5.31M|                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
   43|  5.31M|  const __m128i v_s0_b = xx_loadl_64(src0);
   44|  5.31M|  const __m128i v_s1_b = xx_loadl_64(src1);
   45|  5.31M|  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
   46|  5.31M|  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
   47|       |
   48|  5.31M|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
   49|  5.31M|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
   50|       |
   51|  5.31M|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
   52|       |
   53|  5.31M|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  5.31M|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   54|       |
   55|  5.31M|  return v_res_w;
   56|  5.31M|}
blend_a64_vmask_sse4.c:blend_4:
   27|   324k|                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
   28|   324k|  const __m128i v_s0_b = xx_loadl_32(src0);
   29|   324k|  const __m128i v_s1_b = xx_loadl_32(src1);
   30|   324k|  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
   31|   324k|  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
   32|       |
   33|   324k|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
   34|   324k|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
   35|   324k|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
   36|   324k|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|   324k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
   37|       |
   38|   324k|  return v_res_w;
   39|   324k|}
blend_a64_vmask_sse4.c:blend_8_b10:
  122|  3.98M|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  123|  3.98M|  const __m128i v_s0_w = xx_loadu_128(src0);
  124|  3.98M|  const __m128i v_s1_w = xx_loadu_128(src1);
  125|       |
  126|  3.98M|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  127|  3.98M|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
  128|       |
  129|  3.98M|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
  130|       |
  131|  3.98M|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|  3.98M|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  132|       |
  133|  3.98M|  return v_res_w;
  134|  3.98M|}
blend_a64_vmask_sse4.c:blend_4_b10:
  107|   222k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  108|   222k|  const __m128i v_s0_w = xx_loadl_64(src0);
  109|   222k|  const __m128i v_s1_w = xx_loadl_64(src1);
  110|       |
  111|   222k|  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  112|   222k|  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
  113|       |
  114|   222k|  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
  115|       |
  116|   222k|  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
  ------------------
  |  |   23|   222k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  117|       |
  118|   222k|  return v_res_w;
  119|   222k|}
blend_a64_vmask_sse4.c:blend_8_b12:
  162|   351k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  163|   351k|  const __m128i v_s0_w = xx_loadu_128(src0);
  164|   351k|  const __m128i v_s1_w = xx_loadu_128(src1);
  165|       |
  166|       |  // Interleave
  167|   351k|  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  168|   351k|  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
  169|   351k|  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  170|   351k|  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
  171|       |
  172|       |  // Multiply-Add
  173|   351k|  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
  174|   351k|  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
  175|       |
  176|       |  // Scale
  177|   351k|  const __m128i v_ssuml_d =
  178|   351k|      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|   351k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  179|   351k|  const __m128i v_ssumh_d =
  180|   351k|      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|   351k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  181|       |
  182|       |  // Pack
  183|   351k|  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
  184|       |
  185|       |  // Round
  186|   351k|  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
  187|       |
  188|   351k|  return v_res_w;
  189|   351k|}
blend_a64_vmask_sse4.c:blend_4_b12:
  137|  8.02k|                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  138|  8.02k|  const __m128i v_s0_w = xx_loadl_64(src0);
  139|  8.02k|  const __m128i v_s1_w = xx_loadl_64(src1);
  140|       |
  141|       |  // Interleave
  142|  8.02k|  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  143|  8.02k|  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  144|       |
  145|       |  // Multiply-Add
  146|  8.02k|  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
  147|       |
  148|       |  // Scale
  149|  8.02k|  const __m128i v_ssum_d =
  150|  8.02k|      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
  ------------------
  |  |   23|  8.02k|#define AOM_BLEND_A64_ROUND_BITS 6
  ------------------
  151|       |
  152|       |  // Pack
  153|  8.02k|  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
  154|       |
  155|       |  // Round
  156|  8.02k|  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
  157|       |
  158|  8.02k|  return v_res_w;
  159|  8.02k|}
blend_a64_mask_avx2.c:blend_4_u8:
   60|   540k|                                 const __m128i *rounding) {
   61|   540k|  const __m128i v_s0_b = xx_loadl_32(src0);
   62|   540k|  const __m128i v_s1_b = xx_loadl_32(src1);
   63|       |
   64|   540k|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   65|   540k|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   66|       |
   67|   540k|  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   68|   540k|  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
   69|   540k|  return v_res;
   70|   540k|}
blend_a64_mask_avx2.c:blend_8_u8:
   74|  1.71M|                                 const __m128i *rounding) {
   75|  1.71M|  const __m128i v_s0_b = xx_loadl_64(src0);
   76|  1.71M|  const __m128i v_s1_b = xx_loadl_64(src1);
   77|       |
   78|  1.71M|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   79|  1.71M|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   80|       |
   81|  1.71M|  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   82|  1.71M|  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
   83|  1.71M|  return v_res;
   84|  1.71M|}
blend_a64_mask_avx2.c:blend_16_u8:
   88|  1.44M|                                  const __m128i *rounding) {
   89|  1.44M|  const __m128i v_s0_b = xx_loadu_128(src0);
   90|  1.44M|  const __m128i v_s1_b = xx_loadu_128(src1);
   91|       |
   92|  1.44M|  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
   93|  1.44M|                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
   94|  1.44M|  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
   95|  1.44M|                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
   96|       |
   97|  1.44M|  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
   98|  1.44M|  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
   99|  1.44M|  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
  100|  1.44M|  return v_res;
  101|  1.44M|}

highbd_convolve_avx2.c:prepare_coeffs:
  683|  1.91M|                                  __m256i *const coeffs /* [4] */) {
  684|  1.91M|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  685|  1.91M|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.91M|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.91M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  686|       |
  687|  1.91M|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  688|  1.91M|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  689|       |
  690|       |  // coeffs 0 1 0 1 0 1 0 1
  691|  1.91M|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  692|       |  // coeffs 2 3 2 3 2 3 2 3
  693|  1.91M|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  694|       |  // coeffs 4 5 4 5 4 5 4 5
  695|  1.91M|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  696|       |  // coeffs 6 7 6 7 6 7 6 7
  697|  1.91M|  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  698|  1.91M|}
highbd_convolve_avx2.c:convolve:
  790|  33.1M|                               const __m256i *const coeffs) {
  791|  33.1M|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|  33.1M|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|  33.1M|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|  33.1M|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|  33.1M|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|  33.1M|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|  33.1M|  return res;
  800|  33.1M|}
convolve_avx2.c:prepare_coeffs_lowbd:
  612|  25.3k|    __m256i *const coeffs /* [4] */) {
  613|  25.3k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  614|  25.3k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  25.3k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  25.3k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  615|  25.3k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  616|  25.3k|  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
  617|       |
  618|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  619|       |  // This extra right shift will be taken care of at the end while rounding
  620|       |  // the result.
  621|       |  // Since all filter co-efficients are even, this change will not affect the
  622|       |  // end result
  623|  25.3k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  624|  25.3k|                            _mm_set1_epi16((short)0xffff)));
  625|       |
  626|  25.3k|  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
  627|       |
  628|       |  // coeffs 0 1 0 1 0 1 0 1
  629|  25.3k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  630|       |  // coeffs 2 3 2 3 2 3 2 3
  631|  25.3k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
  632|       |  // coeffs 4 5 4 5 4 5 4 5
  633|  25.3k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
  634|       |  // coeffs 6 7 6 7 6 7 6 7
  635|  25.3k|  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
  636|  25.3k|}
convolve_avx2.c:convolve_lowbd_4tap:
  752|   613k|                                          const __m256i *const coeffs) {
  753|   613k|  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  754|   613k|  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  755|       |
  756|       |  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  757|   613k|  const __m256i res = _mm256_add_epi16(res_45, res_23);
  758|       |
  759|   613k|  return res;
  760|   613k|}
convolve_avx2.c:convolve_lowbd_x_4tap:
  838|   613k|                                            const __m256i *const filt) {
  839|   613k|  __m256i s[2];
  840|       |
  841|   613k|  s[0] = _mm256_shuffle_epi8(data, filt[0]);
  842|   613k|  s[1] = _mm256_shuffle_epi8(data, filt[1]);
  843|       |
  844|   613k|  return convolve_lowbd_4tap(s, coeffs);
  845|   613k|}
jnt_convolve_avx2.c:prepare_coeffs_lowbd:
  612|   470k|    __m256i *const coeffs /* [4] */) {
  613|   470k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  614|   470k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   470k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   470k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  615|   470k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  616|   470k|  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
  617|       |
  618|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  619|       |  // This extra right shift will be taken care of at the end while rounding
  620|       |  // the result.
  621|       |  // Since all filter co-efficients are even, this change will not affect the
  622|       |  // end result
  623|   470k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  624|   470k|                            _mm_set1_epi16((short)0xffff)));
  625|       |
  626|   470k|  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
  627|       |
  628|       |  // coeffs 0 1 0 1 0 1 0 1
  629|   470k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  630|       |  // coeffs 2 3 2 3 2 3 2 3
  631|   470k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
  632|       |  // coeffs 4 5 4 5 4 5 4 5
  633|   470k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
  634|       |  // coeffs 6 7 6 7 6 7 6 7
  635|   470k|  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
  636|   470k|}
jnt_convolve_avx2.c:convolve_lowbd_x_4tap:
  838|  3.00M|                                            const __m256i *const filt) {
  839|  3.00M|  __m256i s[2];
  840|       |
  841|  3.00M|  s[0] = _mm256_shuffle_epi8(data, filt[0]);
  842|  3.00M|  s[1] = _mm256_shuffle_epi8(data, filt[1]);
  843|       |
  844|  3.00M|  return convolve_lowbd_4tap(s, coeffs);
  845|  3.00M|}
jnt_convolve_avx2.c:comp_avg:
  864|   130M|                               const int use_dist_wtd_comp_avg) {
  865|   130M|  __m256i res;
  866|   130M|  if (use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (866:7): [True: 2.49M, False: 128M]
  ------------------
  867|  2.49M|    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
  868|  2.49M|    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
  869|       |
  870|  2.49M|    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
  871|  2.49M|    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
  872|       |
  873|  2.49M|    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  2.49M|#define DIST_PRECISION_BITS 4
  ------------------
  874|  2.49M|    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  2.49M|#define DIST_PRECISION_BITS 4
  ------------------
  875|       |
  876|  2.49M|    res = _mm256_packs_epi32(res_lo, res_hi);
  877|   128M|  } else {
  878|   128M|    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
  879|   128M|    res = _mm256_srai_epi16(wt_res, 1);
  880|   128M|  }
  881|   130M|  return res;
  882|   130M|}
jnt_convolve_avx2.c:convolve_rounding:
  887|   130M|                                        const int round_shift) {
  888|   130M|  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
  889|   130M|  const __m256i res_round = _mm256_srai_epi16(
  890|   130M|      _mm256_add_epi16(res_signed, *round_const), round_shift);
  891|   130M|  return res_round;
  892|   130M|}
jnt_convolve_avx2.c:convolve_lowbd_x:
  813|  6.94M|                                       const __m256i *const filt) {
  814|  6.94M|  __m256i s[4];
  815|       |
  816|  6.94M|  s[0] = _mm256_shuffle_epi8(data, filt[0]);
  817|  6.94M|  s[1] = _mm256_shuffle_epi8(data, filt[1]);
  818|  6.94M|  s[2] = _mm256_shuffle_epi8(data, filt[2]);
  819|  6.94M|  s[3] = _mm256_shuffle_epi8(data, filt[3]);
  820|       |
  821|  6.94M|  return convolve_lowbd(s, coeffs);
  822|  6.94M|}
jnt_convolve_avx2.c:convolve_lowbd_4tap:
  752|  3.31M|                                          const __m256i *const coeffs) {
  753|  3.31M|  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  754|  3.31M|  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  755|       |
  756|       |  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  757|  3.31M|  const __m256i res = _mm256_add_epi16(res_45, res_23);
  758|       |
  759|  3.31M|  return res;
  760|  3.31M|}
jnt_convolve_avx2.c:convolve_lowbd:
  725|  7.81M|                                     const __m256i *const coeffs) {
  726|  7.81M|  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  727|  7.81M|  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  728|  7.81M|  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  729|  7.81M|  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
  730|       |
  731|       |  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  732|  7.81M|  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
  733|  7.81M|                                       _mm256_add_epi16(res_23, res_67));
  734|       |
  735|  7.81M|  return res;
  736|  7.81M|}
jnt_convolve_avx2.c:prepare_coeffs:
  683|   237k|                                  __m256i *const coeffs /* [4] */) {
  684|   237k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  685|   237k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   237k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   237k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  686|       |
  687|   237k|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  688|   237k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  689|       |
  690|       |  // coeffs 0 1 0 1 0 1 0 1
  691|   237k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  692|       |  // coeffs 2 3 2 3 2 3 2 3
  693|   237k|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  694|       |  // coeffs 4 5 4 5 4 5 4 5
  695|   237k|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  696|       |  // coeffs 6 7 6 7 6 7 6 7
  697|   237k|  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  698|   237k|}
jnt_convolve_avx2.c:convolve:
  790|  6.72M|                               const __m256i *const coeffs) {
  791|  6.72M|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|  6.72M|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|  6.72M|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|  6.72M|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|  6.72M|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|  6.72M|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|  6.72M|  return res;
  800|  6.72M|}
jnt_convolve_avx2.c:convolve_4tap:
  803|   288k|                                    const __m256i *const coeffs) {
  804|   288k|  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
  805|   288k|  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
  806|       |
  807|   288k|  const __m256i res = _mm256_add_epi32(res_1, res_2);
  808|   288k|  return res;
  809|   288k|}
wiener_convolve_avx2.c:convolve_lowbd_x:
  813|  16.3M|                                       const __m256i *const filt) {
  814|  16.3M|  __m256i s[4];
  815|       |
  816|  16.3M|  s[0] = _mm256_shuffle_epi8(data, filt[0]);
  817|  16.3M|  s[1] = _mm256_shuffle_epi8(data, filt[1]);
  818|  16.3M|  s[2] = _mm256_shuffle_epi8(data, filt[2]);
  819|  16.3M|  s[3] = _mm256_shuffle_epi8(data, filt[3]);
  820|       |
  821|  16.3M|  return convolve_lowbd(s, coeffs);
  822|  16.3M|}
wiener_convolve_avx2.c:convolve_lowbd:
  725|  16.3M|                                     const __m256i *const coeffs) {
  726|  16.3M|  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  727|  16.3M|  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  728|  16.3M|  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  729|  16.3M|  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
  730|       |
  731|       |  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  732|  16.3M|  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
  733|  16.3M|                                       _mm256_add_epi16(res_23, res_67));
  734|       |
  735|  16.3M|  return res;
  736|  16.3M|}
wiener_convolve_avx2.c:convolve:
  790|  38.9M|                               const __m256i *const coeffs) {
  791|  38.9M|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|  38.9M|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|  38.9M|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|  38.9M|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|  38.9M|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|  38.9M|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|  38.9M|  return res;
  800|  38.9M|}
highbd_convolve_2d_avx2.c:prepare_coeffs:
  683|  5.98M|                                  __m256i *const coeffs /* [4] */) {
  684|  5.98M|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  685|  5.98M|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  5.98M|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  5.98M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  686|       |
  687|  5.98M|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  688|  5.98M|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  689|       |
  690|       |  // coeffs 0 1 0 1 0 1 0 1
  691|  5.98M|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  692|       |  // coeffs 2 3 2 3 2 3 2 3
  693|  5.98M|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  694|       |  // coeffs 4 5 4 5 4 5 4 5
  695|  5.98M|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  696|       |  // coeffs 6 7 6 7 6 7 6 7
  697|  5.98M|  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  698|  5.98M|}
highbd_convolve_2d_avx2.c:convolve:
  790|   128M|                               const __m256i *const coeffs) {
  791|   128M|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|   128M|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|   128M|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|   128M|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|   128M|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|   128M|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|   128M|  return res;
  800|   128M|}
highbd_jnt_convolve_avx2.c:highbd_comp_avg:
  898|  24.1M|                                      const int use_dist_wtd_comp_avg) {
  899|  24.1M|  __m256i res;
  900|  24.1M|  if (use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (900:7): [True: 2.65M, False: 21.5M]
  ------------------
  901|  2.65M|    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
  902|  2.65M|    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
  903|  2.65M|    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
  904|  2.65M|    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  2.65M|#define DIST_PRECISION_BITS 4
  ------------------
  905|  21.5M|  } else {
  906|  21.5M|    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
  907|  21.5M|    res = _mm256_srai_epi32(wt_res, 1);
  908|  21.5M|  }
  909|  24.1M|  return res;
  910|  24.1M|}
highbd_jnt_convolve_avx2.c:highbd_convolve_rounding:
  914|  24.3M|    const __m256i *const round_const, const int round_shift) {
  915|  24.3M|  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
  916|  24.3M|  const __m256i res_round = _mm256_srai_epi32(
  917|  24.3M|      _mm256_add_epi32(res_signed, *round_const), round_shift);
  918|       |
  919|  24.3M|  return res_round;
  920|  24.3M|}
highbd_jnt_convolve_avx2.c:prepare_coeffs:
  683|  1.96M|                                  __m256i *const coeffs /* [4] */) {
  684|  1.96M|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  685|  1.96M|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  1.96M|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  1.96M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  686|       |
  687|  1.96M|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  688|  1.96M|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  689|       |
  690|       |  // coeffs 0 1 0 1 0 1 0 1
  691|  1.96M|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  692|       |  // coeffs 2 3 2 3 2 3 2 3
  693|  1.96M|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  694|       |  // coeffs 4 5 4 5 4 5 4 5
  695|  1.96M|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  696|       |  // coeffs 6 7 6 7 6 7 6 7
  697|  1.96M|  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  698|  1.96M|}
highbd_jnt_convolve_avx2.c:convolve:
  790|  81.6M|                               const __m256i *const coeffs) {
  791|  81.6M|  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  792|  81.6M|  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  793|  81.6M|  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  794|  81.6M|  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
  795|       |
  796|  81.6M|  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
  797|  81.6M|                                       _mm256_add_epi32(res_2, res_3));
  798|       |
  799|  81.6M|  return res;
  800|  81.6M|}

av1_highbd_convolve_y_sr_avx2:
   44|  1.02M|                                   const int subpel_y_qn, int bd) {
   45|  1.02M|  if (filter_params_y->taps == 12) {
  ------------------
  |  Branch (45:7): [True: 0, False: 1.02M]
  ------------------
   46|      0|    av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
   47|      0|                                   filter_params_y, subpel_y_qn, bd);
   48|      0|    return;
   49|      0|  }
   50|  1.02M|  int i, j;
   51|  1.02M|  const int fo_vert = filter_params_y->taps / 2 - 1;
   52|  1.02M|  const uint16_t *const src_ptr = src - fo_vert * src_stride;
   53|       |
   54|  1.02M|  __m256i s[8], coeffs_y[4];
   55|       |
   56|  1.02M|  const int bits = FILTER_BITS;
  ------------------
  |  |   21|  1.02M|#define FILTER_BITS 7
  ------------------
   57|       |
   58|  1.02M|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
   59|  1.02M|  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
   60|  1.02M|  const __m256i clip_pixel =
   61|  1.02M|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (61:25): [True: 1.01M, False: 2.77k]
  |  Branch (61:44): [True: 2.77k, False: 0]
  ------------------
   62|  1.02M|  const __m256i zero = _mm256_setzero_si256();
   63|       |
   64|  1.02M|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
   65|       |
   66|  2.45M|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (66:15): [True: 1.42M, False: 1.02M]
  ------------------
   67|  1.42M|    const uint16_t *data = &src_ptr[j];
   68|       |    /* Vertical filter */
   69|  1.42M|    {
   70|  1.42M|      __m256i src6;
   71|  1.42M|      __m256i s01 = _mm256_permute2x128_si256(
   72|  1.42M|          _mm256_castsi128_si256(
   73|  1.42M|              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
   74|  1.42M|          _mm256_castsi128_si256(
   75|  1.42M|              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
   76|  1.42M|          0x20);
   77|  1.42M|      __m256i s12 = _mm256_permute2x128_si256(
   78|  1.42M|          _mm256_castsi128_si256(
   79|  1.42M|              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
   80|  1.42M|          _mm256_castsi128_si256(
   81|  1.42M|              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
   82|  1.42M|          0x20);
   83|  1.42M|      __m256i s23 = _mm256_permute2x128_si256(
   84|  1.42M|          _mm256_castsi128_si256(
   85|  1.42M|              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
   86|  1.42M|          _mm256_castsi128_si256(
   87|  1.42M|              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
   88|  1.42M|          0x20);
   89|  1.42M|      __m256i s34 = _mm256_permute2x128_si256(
   90|  1.42M|          _mm256_castsi128_si256(
   91|  1.42M|              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
   92|  1.42M|          _mm256_castsi128_si256(
   93|  1.42M|              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
   94|  1.42M|          0x20);
   95|  1.42M|      __m256i s45 = _mm256_permute2x128_si256(
   96|  1.42M|          _mm256_castsi128_si256(
   97|  1.42M|              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
   98|  1.42M|          _mm256_castsi128_si256(
   99|  1.42M|              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
  100|  1.42M|          0x20);
  101|  1.42M|      src6 = _mm256_castsi128_si256(
  102|  1.42M|          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
  103|  1.42M|      __m256i s56 = _mm256_permute2x128_si256(
  104|  1.42M|          _mm256_castsi128_si256(
  105|  1.42M|              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
  106|  1.42M|          src6, 0x20);
  107|       |
  108|  1.42M|      s[0] = _mm256_unpacklo_epi16(s01, s12);
  109|  1.42M|      s[1] = _mm256_unpacklo_epi16(s23, s34);
  110|  1.42M|      s[2] = _mm256_unpacklo_epi16(s45, s56);
  111|       |
  112|  1.42M|      s[4] = _mm256_unpackhi_epi16(s01, s12);
  113|  1.42M|      s[5] = _mm256_unpackhi_epi16(s23, s34);
  114|  1.42M|      s[6] = _mm256_unpackhi_epi16(s45, s56);
  115|       |
  116|  9.64M|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (116:19): [True: 8.21M, False: 1.42M]
  ------------------
  117|  8.21M|        data = &src_ptr[i * src_stride + j];
  118|       |
  119|  8.21M|        const __m256i s67 = _mm256_permute2x128_si256(
  120|  8.21M|            src6,
  121|  8.21M|            _mm256_castsi128_si256(
  122|  8.21M|                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
  123|  8.21M|            0x20);
  124|       |
  125|  8.21M|        src6 = _mm256_castsi128_si256(
  126|  8.21M|            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
  127|       |
  128|  8.21M|        const __m256i s78 = _mm256_permute2x128_si256(
  129|  8.21M|            _mm256_castsi128_si256(
  130|  8.21M|                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
  131|  8.21M|            src6, 0x20);
  132|       |
  133|  8.21M|        s[3] = _mm256_unpacklo_epi16(s67, s78);
  134|  8.21M|        s[7] = _mm256_unpackhi_epi16(s67, s78);
  135|       |
  136|  8.21M|        const __m256i res_a = convolve(s, coeffs_y);
  137|       |
  138|  8.21M|        __m256i res_a_round = _mm256_sra_epi32(
  139|  8.21M|            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
  140|       |
  141|  8.21M|        if (w - j > 4) {
  ------------------
  |  Branch (141:13): [True: 6.84M, False: 1.37M]
  ------------------
  142|  6.84M|          const __m256i res_b = convolve(s + 4, coeffs_y);
  143|  6.84M|          __m256i res_b_round = _mm256_sra_epi32(
  144|  6.84M|              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
  145|       |
  146|  6.84M|          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
  147|  6.84M|          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
  148|  6.84M|          res_16bit = _mm256_max_epi16(res_16bit, zero);
  149|       |
  150|  6.84M|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
  151|  6.84M|                           _mm256_castsi256_si128(res_16bit));
  152|  6.84M|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
  153|  6.84M|                           _mm256_extracti128_si256(res_16bit, 1));
  154|  6.84M|        } else if (w == 4) {
  ------------------
  |  Branch (154:20): [True: 1.11M, False: 263k]
  ------------------
  155|  1.11M|          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
  156|  1.11M|          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
  157|  1.11M|          res_a_round = _mm256_max_epi16(res_a_round, zero);
  158|       |
  159|  1.11M|          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
  160|  1.11M|                           _mm256_castsi256_si128(res_a_round));
  161|  1.11M|          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
  162|  1.11M|                           _mm256_extracti128_si256(res_a_round, 1));
  163|  1.11M|        } else {
  164|   263k|          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
  165|   263k|          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
  166|   263k|          res_a_round = _mm256_max_epi16(res_a_round, zero);
  167|       |
  168|   263k|          xx_storel_32(&dst[i * dst_stride + j],
  169|   263k|                       _mm256_castsi256_si128(res_a_round));
  170|   263k|          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
  171|   263k|                       _mm256_extracti128_si256(res_a_round, 1));
  172|   263k|        }
  173|       |
  174|  8.21M|        s[0] = s[1];
  175|  8.21M|        s[1] = s[2];
  176|  8.21M|        s[2] = s[3];
  177|       |
  178|  8.21M|        s[4] = s[5];
  179|  8.21M|        s[5] = s[6];
  180|  8.21M|        s[6] = s[7];
  181|  8.21M|      }
  182|  1.42M|    }
  183|  1.42M|  }
  184|  1.02M|}
av1_highbd_convolve_x_sr_avx2:
  190|   890k|                                   ConvolveParams *conv_params, int bd) {
  191|   890k|  if (filter_params_x->taps == 12) {
  ------------------
  |  Branch (191:7): [True: 0, False: 890k]
  ------------------
  192|      0|    av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
  193|      0|                                   filter_params_x, subpel_x_qn, conv_params,
  194|      0|                                   bd);
  195|      0|    return;
  196|      0|  }
  197|   890k|  int i, j;
  198|   890k|  const int fo_horiz = filter_params_x->taps / 2 - 1;
  199|   890k|  const uint16_t *const src_ptr = src - fo_horiz;
  200|       |
  201|       |  // Check that, even with 12-bit input, the intermediate values will fit
  202|       |  // into an unsigned 16-bit intermediate array.
  203|   890k|  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
  204|       |
  205|   890k|  __m256i s[4], coeffs_x[4];
  206|       |
  207|   890k|  const __m256i round_const_x =
  208|   890k|      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  209|   890k|  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
  210|       |
  211|   890k|  const int bits = FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|   890k|#define FILTER_BITS 7
  ------------------
  212|   890k|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  213|   890k|  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  214|   890k|  const __m256i clip_pixel =
  215|   890k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (215:25): [True: 889k, False: 1.81k]
  |  Branch (215:44): [True: 1.81k, False: 0]
  ------------------
  216|   890k|  const __m256i zero = _mm256_setzero_si256();
  217|       |
  218|   890k|  assert(bits >= 0);
  219|   890k|  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
  220|   890k|         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  221|       |
  222|   890k|  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
  223|       |
  224|  2.20M|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (224:15): [True: 1.31M, False: 890k]
  ------------------
  225|       |    /* Horizontal filter */
  226|  10.4M|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (226:17): [True: 9.10M, False: 1.31M]
  ------------------
  227|  9.10M|      const __m256i row0 =
  228|  9.10M|          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
  229|  9.10M|      __m256i row1 =
  230|  9.10M|          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
  231|       |
  232|  9.10M|      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
  233|  9.10M|      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
  234|       |
  235|       |      // even pixels
  236|  9.10M|      s[0] = _mm256_alignr_epi8(r1, r0, 0);
  237|  9.10M|      s[1] = _mm256_alignr_epi8(r1, r0, 4);
  238|  9.10M|      s[2] = _mm256_alignr_epi8(r1, r0, 8);
  239|  9.10M|      s[3] = _mm256_alignr_epi8(r1, r0, 12);
  240|       |
  241|  9.10M|      __m256i res_even = convolve(s, coeffs_x);
  242|  9.10M|      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
  243|  9.10M|                                  round_shift_x);
  244|       |
  245|       |      // odd pixels
  246|  9.10M|      s[0] = _mm256_alignr_epi8(r1, r0, 2);
  247|  9.10M|      s[1] = _mm256_alignr_epi8(r1, r0, 6);
  248|  9.10M|      s[2] = _mm256_alignr_epi8(r1, r0, 10);
  249|  9.10M|      s[3] = _mm256_alignr_epi8(r1, r0, 14);
  250|       |
  251|  9.10M|      __m256i res_odd = convolve(s, coeffs_x);
  252|  9.10M|      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
  253|  9.10M|                                 round_shift_x);
  254|       |
  255|  9.10M|      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
  256|  9.10M|                                  round_shift_bits);
  257|  9.10M|      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
  258|  9.10M|                                 round_shift_bits);
  259|       |
  260|  9.10M|      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
  261|  9.10M|      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
  262|       |
  263|  9.10M|      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
  264|  9.10M|      res = _mm256_min_epi16(res, clip_pixel);
  265|  9.10M|      res = _mm256_max_epi16(res, zero);
  266|       |
  267|  9.10M|      if (w - j > 4) {
  ------------------
  |  Branch (267:11): [True: 7.94M, False: 1.15M]
  ------------------
  268|  7.94M|        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
  269|  7.94M|                         _mm256_castsi256_si128(res));
  270|  7.94M|        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
  271|  7.94M|                         _mm256_extracti128_si256(res, 1));
  272|  7.94M|      } else if (w == 4) {
  ------------------
  |  Branch (272:18): [True: 929k, False: 229k]
  ------------------
  273|   929k|        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
  274|   929k|                         _mm256_castsi256_si128(res));
  275|   929k|        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
  276|   929k|                         _mm256_extracti128_si256(res, 1));
  277|   929k|      } else {
  278|   229k|        xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
  279|   229k|        xx_storel_32(&dst[i * dst_stride + j + dst_stride],
  280|   229k|                     _mm256_extracti128_si256(res, 1));
  281|   229k|      }
  282|  9.10M|    }
  283|  1.31M|  }
  284|   890k|}

aom_highbd_h_predictor_4x4_sse2:
   21|   286k|                                     const uint16_t *left, int bd) {
   22|   286k|  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
   23|   286k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
   24|   286k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
   25|   286k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
   26|   286k|  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
   27|   286k|  (void)above;
   28|   286k|  (void)bd;
   29|   286k|  _mm_storel_epi64((__m128i *)dst, row0);
   30|   286k|  dst += stride;
   31|   286k|  _mm_storel_epi64((__m128i *)dst, row1);
   32|   286k|  dst += stride;
   33|   286k|  _mm_storel_epi64((__m128i *)dst, row2);
   34|   286k|  dst += stride;
   35|   286k|  _mm_storel_epi64((__m128i *)dst, row3);
   36|   286k|}
aom_highbd_h_predictor_4x8_sse2:
   40|  53.2k|                                     const uint16_t *left, int bd) {
   41|  53.2k|  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
   42|  53.2k|  dst += stride << 2;
   43|  53.2k|  left += 4;
   44|  53.2k|  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
   45|  53.2k|}
aom_highbd_h_predictor_8x4_sse2:
   49|   133k|                                     const uint16_t *left, int bd) {
   50|   133k|  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
   51|   133k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
   52|   133k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
   53|   133k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
   54|   133k|  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
   55|   133k|  (void)above;
   56|   133k|  (void)bd;
   57|   133k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
   58|   133k|  dst += stride;
   59|   133k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
   60|   133k|  dst += stride;
   61|   133k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
   62|   133k|  dst += stride;
   63|   133k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
   64|   133k|}
aom_highbd_h_predictor_8x8_sse2:
   68|   400k|                                     const uint16_t *left, int bd) {
   69|   400k|  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
   70|   400k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
   71|   400k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
   72|   400k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
   73|   400k|  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
   74|   400k|  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
   75|   400k|  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
   76|   400k|  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
   77|   400k|  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
   78|   400k|  (void)above;
   79|   400k|  (void)bd;
   80|   400k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
   81|   400k|  dst += stride;
   82|   400k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
   83|   400k|  dst += stride;
   84|   400k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
   85|   400k|  dst += stride;
   86|   400k|  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
   87|   400k|  dst += stride;
   88|   400k|  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
   89|   400k|  dst += stride;
   90|   400k|  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
   91|   400k|  dst += stride;
   92|   400k|  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
   93|   400k|  dst += stride;
   94|   400k|  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
   95|   400k|}
aom_highbd_h_predictor_8x16_sse2:
   99|  57.2k|                                      const uint16_t *left, int bd) {
  100|  57.2k|  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  101|  57.2k|  dst += stride << 3;
  102|  57.2k|  left += 8;
  103|  57.2k|  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  104|  57.2k|}
aom_highbd_h_predictor_16x8_sse2:
  145|   113k|                                      const uint16_t *left, int bd) {
  146|   113k|  (void)above;
  147|   113k|  (void)bd;
  148|   113k|  h_predictor_16x8(dst, stride, left);
  149|   113k|}
aom_highbd_h_predictor_16x16_sse2:
  153|   284k|                                       const uint16_t *left, int bd) {
  154|   284k|  int i;
  155|   284k|  (void)above;
  156|   284k|  (void)bd;
  157|       |
  158|   853k|  for (i = 0; i < 2; i++, left += 8) {
  ------------------
  |  Branch (158:15): [True: 569k, False: 284k]
  ------------------
  159|   569k|    h_predictor_16x8(dst, stride, left);
  160|   569k|    dst += stride << 3;
  161|   569k|  }
  162|   284k|}
aom_highbd_h_predictor_16x32_sse2:
  166|  26.1k|                                       const uint16_t *left, int bd) {
  167|  26.1k|  int i;
  168|  26.1k|  (void)above;
  169|  26.1k|  (void)bd;
  170|       |
  171|   130k|  for (i = 0; i < 4; i++, left += 8) {
  ------------------
  |  Branch (171:15): [True: 104k, False: 26.1k]
  ------------------
  172|   104k|    h_predictor_16x8(dst, stride, left);
  173|   104k|    dst += stride << 3;
  174|   104k|  }
  175|  26.1k|}
aom_highbd_h_predictor_32x16_sse2:
  220|  67.7k|                                       const uint16_t *left, int bd) {
  221|  67.7k|  int i;
  222|  67.7k|  (void)above;
  223|  67.7k|  (void)bd;
  224|       |
  225|   203k|  for (i = 0; i < 2; i++, left += 8) {
  ------------------
  |  Branch (225:15): [True: 135k, False: 67.7k]
  ------------------
  226|   135k|    h_predictor_32x8(dst, stride, left);
  227|   135k|    dst += stride << 3;
  228|   135k|  }
  229|  67.7k|}
aom_highbd_h_predictor_32x32_sse2:
  233|   165k|                                       const uint16_t *left, int bd) {
  234|   165k|  int i;
  235|   165k|  (void)above;
  236|   165k|  (void)bd;
  237|       |
  238|   828k|  for (i = 0; i < 4; i++, left += 8) {
  ------------------
  |  Branch (238:15): [True: 662k, False: 165k]
  ------------------
  239|   662k|    h_predictor_32x8(dst, stride, left);
  240|   662k|    dst += stride << 3;
  241|   662k|  }
  242|   165k|}
aom_highbd_dc_left_predictor_4x4_sse2:
  267|  94.5k|                                           const uint16_t *left, int bd) {
  268|  94.5k|  const __m128i two = _mm_cvtsi32_si128(2);
  269|  94.5k|  const __m128i sum = dc_sum_4(left);
  270|  94.5k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  271|  94.5k|  (void)above;
  272|  94.5k|  (void)bd;
  273|  94.5k|  dc_store_4x4(dst, stride, &dc);
  274|  94.5k|}
aom_highbd_dc_top_predictor_4x4_sse2:
  278|  1.32M|                                          const uint16_t *left, int bd) {
  279|  1.32M|  const __m128i two = _mm_cvtsi32_si128(2);
  280|  1.32M|  const __m128i sum = dc_sum_4(above);
  281|  1.32M|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  282|  1.32M|  (void)left;
  283|  1.32M|  (void)bd;
  284|  1.32M|  dc_store_4x4(dst, stride, &dc);
  285|  1.32M|}
aom_highbd_dc_128_predictor_4x4_sse2:
  289|  6.56k|                                          const uint16_t *left, int bd) {
  290|  6.56k|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  291|  6.56k|  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  292|  6.56k|  (void)above;
  293|  6.56k|  (void)left;
  294|  6.56k|  dc_store_4x4(dst, stride, &dc_dup);
  295|  6.56k|}
aom_highbd_dc_left_predictor_4x8_sse2:
  321|  3.16k|                                           const uint16_t *left, int bd) {
  322|  3.16k|  const __m128i sum = dc_sum_8(left);
  323|  3.16k|  const __m128i four = _mm_cvtsi32_si128(4);
  324|  3.16k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  325|  3.16k|  (void)above;
  326|  3.16k|  (void)bd;
  327|  3.16k|  dc_store_4x8(dst, stride, &dc);
  328|  3.16k|}
aom_highbd_dc_top_predictor_4x8_sse2:
  332|  16.6k|                                          const uint16_t *left, int bd) {
  333|  16.6k|  const __m128i two = _mm_cvtsi32_si128(2);
  334|  16.6k|  const __m128i sum = dc_sum_4(above);
  335|  16.6k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  336|  16.6k|  (void)left;
  337|  16.6k|  (void)bd;
  338|  16.6k|  dc_store_4x8(dst, stride, &dc);
  339|  16.6k|}
aom_highbd_dc_128_predictor_4x8_sse2:
  343|    570|                                          const uint16_t *left, int bd) {
  344|    570|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  345|    570|  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  346|    570|  (void)above;
  347|    570|  (void)left;
  348|    570|  dc_store_4x8(dst, stride, &dc_dup);
  349|    570|}
aom_highbd_dc_top_predictor_8x4_sse2:
  377|  26.1k|                                          const uint16_t *left, int bd) {
  378|  26.1k|  (void)left;
  379|  26.1k|  (void)bd;
  380|  26.1k|  dc_top_predictor_8xh(dst, stride, 4, above);
  381|  26.1k|}
aom_highbd_dc_top_predictor_8x8_sse2:
  385|  31.2k|                                          const uint16_t *left, int bd) {
  386|  31.2k|  (void)left;
  387|  31.2k|  (void)bd;
  388|  31.2k|  dc_top_predictor_8xh(dst, stride, 8, above);
  389|  31.2k|}
aom_highbd_dc_top_predictor_8x16_sse2:
  393|  8.00k|                                           const uint16_t *left, int bd) {
  394|  8.00k|  (void)left;
  395|  8.00k|  (void)bd;
  396|  8.00k|  dc_top_predictor_8xh(dst, stride, 16, above);
  397|  8.00k|}
aom_highbd_dc_left_predictor_8x4_sse2:
  404|  6.39k|                                           const uint16_t *left, int bd) {
  405|  6.39k|  const __m128i two = _mm_cvtsi32_si128(2);
  406|  6.39k|  const __m128i sum = dc_sum_4(left);
  407|  6.39k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  408|  6.39k|  (void)above;
  409|  6.39k|  (void)bd;
  410|  6.39k|  dc_store_8xh(dst, stride, 4, &dc);
  411|  6.39k|}
aom_highbd_dc_left_predictor_8x8_sse2:
  415|  43.5k|                                           const uint16_t *left, int bd) {
  416|  43.5k|  const __m128i four = _mm_cvtsi32_si128(4);
  417|  43.5k|  const __m128i sum = dc_sum_8(left);
  418|  43.5k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  419|  43.5k|  (void)above;
  420|  43.5k|  (void)bd;
  421|  43.5k|  dc_store_8xh(dst, stride, 8, &dc);
  422|  43.5k|}
aom_highbd_dc_left_predictor_8x16_sse2:
  433|  5.58k|                                            const uint16_t *left, int bd) {
  434|  5.58k|  const __m128i eight = _mm_cvtsi32_si128(8);
  435|  5.58k|  const __m128i sum = dc_sum_16(left);
  436|  5.58k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  437|  5.58k|  (void)above;
  438|  5.58k|  (void)bd;
  439|  5.58k|  dc_store_8xh(dst, stride, 16, &dc);
  440|  5.58k|}
aom_highbd_dc_128_predictor_8x4_sse2:
  454|     95|                                          const uint16_t *left, int bd) {
  455|     95|  (void)above;
  456|     95|  (void)left;
  457|     95|  dc_128_predictor_8xh(dst, stride, 4, bd);
  458|     95|}
aom_highbd_dc_128_predictor_8x8_sse2:
  462|  1.67k|                                          const uint16_t *left, int bd) {
  463|  1.67k|  (void)above;
  464|  1.67k|  (void)left;
  465|  1.67k|  dc_128_predictor_8xh(dst, stride, 8, bd);
  466|  1.67k|}
aom_highbd_dc_128_predictor_8x16_sse2:
  470|  1.16k|                                           const uint16_t *left, int bd) {
  471|  1.16k|  (void)above;
  472|  1.16k|  (void)left;
  473|  1.16k|  dc_128_predictor_8xh(dst, stride, 16, bd);
  474|  1.16k|}
aom_highbd_dc_left_predictor_16x8_sse2:
  495|  13.3k|                                            const uint16_t *left, int bd) {
  496|  13.3k|  const __m128i four = _mm_cvtsi32_si128(4);
  497|  13.3k|  const __m128i sum = dc_sum_8(left);
  498|  13.3k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  499|  13.3k|  (void)above;
  500|  13.3k|  (void)bd;
  501|  13.3k|  dc_store_16xh(dst, stride, 8, &dc);
  502|  13.3k|}
aom_highbd_dc_left_predictor_16x16_sse2:
  506|  43.5k|                                             const uint16_t *left, int bd) {
  507|  43.5k|  const __m128i eight = _mm_cvtsi32_si128(8);
  508|  43.5k|  const __m128i sum = dc_sum_16(left);
  509|  43.5k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  510|  43.5k|  (void)above;
  511|  43.5k|  (void)bd;
  512|  43.5k|  dc_store_16xh(dst, stride, 16, &dc);
  513|  43.5k|}
aom_highbd_dc_left_predictor_16x32_sse2:
  527|  8.01k|                                             const uint16_t *left, int bd) {
  528|  8.01k|  const __m128i sixteen = _mm_cvtsi32_si128(16);
  529|  8.01k|  const __m128i sum = dc_sum_32(left);
  530|  8.01k|  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  531|  8.01k|  (void)above;
  532|  8.01k|  (void)bd;
  533|  8.01k|  dc_store_16xh(dst, stride, 32, &dc);
  534|  8.01k|}
aom_highbd_dc_top_predictor_16x8_sse2:
  541|  8.25k|                                           const uint16_t *left, int bd) {
  542|  8.25k|  const __m128i eight = _mm_cvtsi32_si128(8);
  543|  8.25k|  const __m128i sum = dc_sum_16(above);
  544|  8.25k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  545|  8.25k|  (void)left;
  546|  8.25k|  (void)bd;
  547|  8.25k|  dc_store_16xh(dst, stride, 8, &dc);
  548|  8.25k|}
aom_highbd_dc_top_predictor_16x16_sse2:
  552|  14.3k|                                            const uint16_t *left, int bd) {
  553|  14.3k|  const __m128i eight = _mm_cvtsi32_si128(8);
  554|  14.3k|  const __m128i sum = dc_sum_16(above);
  555|  14.3k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  556|  14.3k|  (void)left;
  557|  14.3k|  (void)bd;
  558|  14.3k|  dc_store_16xh(dst, stride, 16, &dc);
  559|  14.3k|}
aom_highbd_dc_top_predictor_16x32_sse2:
  563|  12.9k|                                            const uint16_t *left, int bd) {
  564|  12.9k|  const __m128i eight = _mm_cvtsi32_si128(8);
  565|  12.9k|  const __m128i sum = dc_sum_16(above);
  566|  12.9k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  567|  12.9k|  (void)left;
  568|  12.9k|  (void)bd;
  569|  12.9k|  dc_store_16xh(dst, stride, 32, &dc);
  570|  12.9k|}
aom_highbd_dc_128_predictor_16x8_sse2:
  577|    104|                                           const uint16_t *left, int bd) {
  578|    104|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  579|    104|  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  580|    104|  (void)above;
  581|    104|  (void)left;
  582|    104|  dc_store_16xh(dst, stride, 8, &dc_dup);
  583|    104|}
aom_highbd_dc_128_predictor_16x16_sse2:
  587|  2.68k|                                            const uint16_t *left, int bd) {
  588|  2.68k|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  589|  2.68k|  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  590|  2.68k|  (void)above;
  591|  2.68k|  (void)left;
  592|  2.68k|  dc_store_16xh(dst, stride, 16, &dc_dup);
  593|  2.68k|}
aom_highbd_dc_128_predictor_16x32_sse2:
  597|  3.70k|                                            const uint16_t *left, int bd) {
  598|  3.70k|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  599|  3.70k|  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  600|  3.70k|  (void)above;
  601|  3.70k|  (void)left;
  602|  3.70k|  dc_store_16xh(dst, stride, 32, &dc_dup);
  603|  3.70k|}
aom_highbd_dc_left_predictor_32x16_sse2:
  623|  10.4k|                                             const uint16_t *left, int bd) {
  624|  10.4k|  const __m128i eight = _mm_cvtsi32_si128(8);
  625|  10.4k|  const __m128i sum = dc_sum_16(left);
  626|  10.4k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
  627|  10.4k|  (void)above;
  628|  10.4k|  (void)bd;
  629|  10.4k|  dc_store_32xh(dst, stride, 16, &dc);
  630|  10.4k|}
aom_highbd_dc_left_predictor_32x32_sse2:
  634|   113k|                                             const uint16_t *left, int bd) {
  635|   113k|  const __m128i sixteen = _mm_cvtsi32_si128(16);
  636|   113k|  const __m128i sum = dc_sum_32(left);
  637|   113k|  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  638|   113k|  (void)above;
  639|   113k|  (void)bd;
  640|   113k|  dc_store_32xh(dst, stride, 32, &dc);
  641|   113k|}
aom_highbd_dc_top_predictor_32x16_sse2:
  645|  16.8k|                                            const uint16_t *left, int bd) {
  646|  16.8k|  const __m128i sixteen = _mm_cvtsi32_si128(16);
  647|  16.8k|  const __m128i sum = dc_sum_32(above);
  648|  16.8k|  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  649|  16.8k|  (void)left;
  650|  16.8k|  (void)bd;
  651|  16.8k|  dc_store_32xh(dst, stride, 16, &dc);
  652|  16.8k|}
aom_highbd_dc_128_predictor_32x16_sse2:
  656|  16.5k|                                            const uint16_t *left, int bd) {
  657|  16.5k|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  658|  16.5k|  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  659|  16.5k|  (void)above;
  660|  16.5k|  (void)left;
  661|  16.5k|  dc_store_32xh(dst, stride, 16, &dc_dup);
  662|  16.5k|}
aom_highbd_dc_top_predictor_32x32_sse2:
  666|  67.7k|                                            const uint16_t *left, int bd) {
  667|  67.7k|  const __m128i sixteen = _mm_cvtsi32_si128(16);
  668|  67.7k|  const __m128i sum = dc_sum_32(above);
  669|  67.7k|  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
  670|  67.7k|  (void)left;
  671|  67.7k|  (void)bd;
  672|  67.7k|  dc_store_32xh(dst, stride, 32, &dc);
  673|  67.7k|}
aom_highbd_dc_128_predictor_32x32_sse2:
  677|  22.0k|                                            const uint16_t *left, int bd) {
  678|  22.0k|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  679|  22.0k|  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  680|  22.0k|  (void)above;
  681|  22.0k|  (void)left;
  682|  22.0k|  dc_store_32xh(dst, stride, 32, &dc_dup);
  683|  22.0k|}
aom_highbd_v_predictor_4x8_sse2:
  690|  35.8k|                                     const uint16_t *left, int bd) {
  691|  35.8k|  (void)left;
  692|  35.8k|  (void)bd;
  693|  35.8k|  const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
  694|  35.8k|  int i;
  695|   107k|  for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (695:15): [True: 71.6k, False: 35.8k]
  ------------------
  696|  71.6k|    _mm_storel_epi64((__m128i *)dst, above_u16);
  697|  71.6k|    _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
  698|  71.6k|    _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
  699|  71.6k|    _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
  700|  71.6k|    dst += stride << 2;
  701|  71.6k|  }
  702|  35.8k|}
aom_highbd_v_predictor_8x4_sse2:
  706|  65.7k|                                     const uint16_t *left, int bd) {
  707|  65.7k|  (void)left;
  708|  65.7k|  (void)bd;
  709|  65.7k|  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
  710|  65.7k|  _mm_store_si128((__m128i *)dst, above_u16);
  711|  65.7k|  _mm_store_si128((__m128i *)(dst + stride), above_u16);
  712|  65.7k|  _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
  713|  65.7k|  _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
  714|  65.7k|}
aom_highbd_v_predictor_8x16_sse2:
  718|  27.5k|                                      const uint16_t *left, int bd) {
  719|  27.5k|  (void)left;
  720|  27.5k|  (void)bd;
  721|  27.5k|  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
  722|  27.5k|  int i;
  723|   137k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (723:15): [True: 110k, False: 27.5k]
  ------------------
  724|   110k|    _mm_store_si128((__m128i *)dst, above_u16);
  725|   110k|    _mm_store_si128((__m128i *)(dst + stride), above_u16);
  726|   110k|    _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
  727|   110k|    _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
  728|   110k|    dst += stride << 2;
  729|   110k|  }
  730|  27.5k|}
aom_highbd_v_predictor_16x8_sse2:
  734|  65.4k|                                      const uint16_t *left, int bd) {
  735|  65.4k|  (void)left;
  736|  65.4k|  (void)bd;
  737|  65.4k|  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
  738|  65.4k|  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
  739|  65.4k|  int i;
  740|   196k|  for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (740:15): [True: 130k, False: 65.4k]
  ------------------
  741|   130k|    _mm_store_si128((__m128i *)dst, above0_u16);
  742|   130k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  743|   130k|    dst += stride;
  744|   130k|    _mm_store_si128((__m128i *)dst, above0_u16);
  745|   130k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  746|   130k|    dst += stride;
  747|   130k|    _mm_store_si128((__m128i *)dst, above0_u16);
  748|   130k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  749|   130k|    dst += stride;
  750|   130k|    _mm_store_si128((__m128i *)dst, above0_u16);
  751|   130k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  752|   130k|    dst += stride;
  753|   130k|  }
  754|  65.4k|}
aom_highbd_v_predictor_16x32_sse2:
  758|  12.6k|                                       const uint16_t *left, int bd) {
  759|  12.6k|  (void)left;
  760|  12.6k|  (void)bd;
  761|  12.6k|  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
  762|  12.6k|  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
  763|  12.6k|  int i;
  764|   113k|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (764:15): [True: 100k, False: 12.6k]
  ------------------
  765|   100k|    _mm_store_si128((__m128i *)dst, above0_u16);
  766|   100k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  767|   100k|    dst += stride;
  768|   100k|    _mm_store_si128((__m128i *)dst, above0_u16);
  769|   100k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  770|   100k|    dst += stride;
  771|   100k|    _mm_store_si128((__m128i *)dst, above0_u16);
  772|   100k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  773|   100k|    dst += stride;
  774|   100k|    _mm_store_si128((__m128i *)dst, above0_u16);
  775|   100k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  776|   100k|    dst += stride;
  777|   100k|  }
  778|  12.6k|}
aom_highbd_v_predictor_32x16_sse2:
  782|  10.7k|                                       const uint16_t *left, int bd) {
  783|  10.7k|  (void)left;
  784|  10.7k|  (void)bd;
  785|  10.7k|  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
  786|  10.7k|  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
  787|  10.7k|  const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
  788|  10.7k|  const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
  789|  10.7k|  int i;
  790|  53.7k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (790:15): [True: 42.9k, False: 10.7k]
  ------------------
  791|  42.9k|    _mm_store_si128((__m128i *)dst, above0_u16);
  792|  42.9k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  793|  42.9k|    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
  794|  42.9k|    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
  795|  42.9k|    dst += stride;
  796|  42.9k|    _mm_store_si128((__m128i *)dst, above0_u16);
  797|  42.9k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  798|  42.9k|    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
  799|  42.9k|    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
  800|  42.9k|    dst += stride;
  801|  42.9k|    _mm_store_si128((__m128i *)dst, above0_u16);
  802|  42.9k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  803|  42.9k|    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
  804|  42.9k|    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
  805|  42.9k|    dst += stride;
  806|  42.9k|    _mm_store_si128((__m128i *)dst, above0_u16);
  807|  42.9k|    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
  808|  42.9k|    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
  809|  42.9k|    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
  810|  42.9k|    dst += stride;
  811|  42.9k|  }
  812|  10.7k|}
aom_highbd_dc_predictor_4x8_sse2:
  819|   316k|                                      const uint16_t *left, int bd) {
  820|   316k|  (void)bd;
  821|   316k|  const __m128i sum_above = dc_sum_4(above);
  822|   316k|  const __m128i sum_left = dc_sum_8(left);
  823|   316k|  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
  824|   316k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  825|   316k|  sum32 >>= 16;
  826|   316k|  sum32 += 6;
  827|   316k|  sum32 /= 12;
  828|   316k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  829|   316k|  int i;
  830|  1.58M|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (830:15): [True: 1.26M, False: 316k]
  ------------------
  831|  1.26M|    _mm_storel_epi64((__m128i *)dst, row);
  832|  1.26M|    dst += stride;
  833|  1.26M|    _mm_storel_epi64((__m128i *)dst, row);
  834|  1.26M|    dst += stride;
  835|  1.26M|  }
  836|   316k|}
aom_highbd_dc_predictor_8x4_sse2:
  840|   598k|                                      const uint16_t *left, int bd) {
  841|   598k|  (void)bd;
  842|   598k|  const __m128i sum_left = dc_sum_4(left);
  843|   598k|  const __m128i sum_above = dc_sum_8(above);
  844|   598k|  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
  845|   598k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  846|   598k|  sum32 >>= 16;
  847|   598k|  sum32 += 6;
  848|   598k|  sum32 /= 12;
  849|   598k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  850|       |
  851|   598k|  _mm_store_si128((__m128i *)dst, row);
  852|   598k|  dst += stride;
  853|   598k|  _mm_store_si128((__m128i *)dst, row);
  854|   598k|  dst += stride;
  855|   598k|  _mm_store_si128((__m128i *)dst, row);
  856|   598k|  dst += stride;
  857|   598k|  _mm_store_si128((__m128i *)dst, row);
  858|   598k|}
aom_highbd_dc_predictor_8x16_sse2:
  862|   324k|                                       const uint16_t *left, int bd) {
  863|   324k|  (void)bd;
  864|   324k|  __m128i sum_left = dc_sum_16(left);
  865|   324k|  __m128i sum_above = dc_sum_8(above);
  866|   324k|  const __m128i zero = _mm_setzero_si128();
  867|   324k|  sum_left = _mm_unpacklo_epi16(sum_left, zero);
  868|   324k|  sum_above = _mm_unpacklo_epi16(sum_above, zero);
  869|   324k|  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  870|   324k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  871|   324k|  sum32 += 12;
  872|   324k|  sum32 /= 24;
  873|   324k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  874|   324k|  int i;
  875|  1.62M|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (875:15): [True: 1.29M, False: 324k]
  ------------------
  876|  1.29M|    _mm_store_si128((__m128i *)dst, row);
  877|  1.29M|    dst += stride;
  878|  1.29M|    _mm_store_si128((__m128i *)dst, row);
  879|  1.29M|    dst += stride;
  880|  1.29M|    _mm_store_si128((__m128i *)dst, row);
  881|  1.29M|    dst += stride;
  882|  1.29M|    _mm_store_si128((__m128i *)dst, row);
  883|  1.29M|    dst += stride;
  884|  1.29M|  }
  885|   324k|}
aom_highbd_dc_predictor_16x8_sse2:
  889|   548k|                                       const uint16_t *left, int bd) {
  890|   548k|  (void)bd;
  891|   548k|  __m128i sum_left = dc_sum_8(left);
  892|   548k|  __m128i sum_above = dc_sum_16(above);
  893|   548k|  const __m128i zero = _mm_setzero_si128();
  894|   548k|  sum_left = _mm_unpacklo_epi16(sum_left, zero);
  895|   548k|  sum_above = _mm_unpacklo_epi16(sum_above, zero);
  896|   548k|  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  897|   548k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  898|   548k|  sum32 += 12;
  899|   548k|  sum32 /= 24;
  900|   548k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  901|   548k|  int i;
  902|  1.64M|  for (i = 0; i < 2; ++i) {
  ------------------
  |  Branch (902:15): [True: 1.09M, False: 548k]
  ------------------
  903|  1.09M|    _mm_store_si128((__m128i *)dst, row);
  904|  1.09M|    _mm_store_si128((__m128i *)(dst + 8), row);
  905|  1.09M|    dst += stride;
  906|  1.09M|    _mm_store_si128((__m128i *)dst, row);
  907|  1.09M|    _mm_store_si128((__m128i *)(dst + 8), row);
  908|  1.09M|    dst += stride;
  909|  1.09M|    _mm_store_si128((__m128i *)dst, row);
  910|  1.09M|    _mm_store_si128((__m128i *)(dst + 8), row);
  911|  1.09M|    dst += stride;
  912|  1.09M|    _mm_store_si128((__m128i *)dst, row);
  913|  1.09M|    _mm_store_si128((__m128i *)(dst + 8), row);
  914|  1.09M|    dst += stride;
  915|  1.09M|  }
  916|   548k|}
aom_highbd_dc_predictor_16x32_sse2:
  920|   176k|                                        const uint16_t *left, int bd) {
  921|   176k|  (void)bd;
  922|   176k|  __m128i sum_left = dc_sum_32(left);
  923|   176k|  __m128i sum_above = dc_sum_16(above);
  924|   176k|  const __m128i zero = _mm_setzero_si128();
  925|   176k|  sum_above = _mm_unpacklo_epi16(sum_above, zero);
  926|   176k|  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  927|   176k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  928|   176k|  sum32 += 24;
  929|   176k|  sum32 /= 48;
  930|   176k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  931|   176k|  int i;
  932|  1.58M|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (932:15): [True: 1.40M, False: 176k]
  ------------------
  933|  1.40M|    _mm_store_si128((__m128i *)dst, row);
  934|  1.40M|    _mm_store_si128((__m128i *)(dst + 8), row);
  935|  1.40M|    dst += stride;
  936|  1.40M|    _mm_store_si128((__m128i *)dst, row);
  937|  1.40M|    _mm_store_si128((__m128i *)(dst + 8), row);
  938|  1.40M|    dst += stride;
  939|  1.40M|    _mm_store_si128((__m128i *)dst, row);
  940|  1.40M|    _mm_store_si128((__m128i *)(dst + 8), row);
  941|  1.40M|    dst += stride;
  942|  1.40M|    _mm_store_si128((__m128i *)dst, row);
  943|  1.40M|    _mm_store_si128((__m128i *)(dst + 8), row);
  944|  1.40M|    dst += stride;
  945|  1.40M|  }
  946|   176k|}
aom_highbd_dc_predictor_32x16_sse2:
  950|   183k|                                        const uint16_t *left, int bd) {
  951|   183k|  (void)bd;
  952|   183k|  __m128i sum_left = dc_sum_16(left);
  953|   183k|  __m128i sum_above = dc_sum_32(above);
  954|   183k|  const __m128i zero = _mm_setzero_si128();
  955|   183k|  sum_left = _mm_unpacklo_epi16(sum_left, zero);
  956|   183k|  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
  957|   183k|  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
  958|   183k|  sum32 += 24;
  959|   183k|  sum32 /= 48;
  960|   183k|  const __m128i row = _mm_set1_epi16((int16_t)sum32);
  961|   183k|  int i;
  962|   915k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (962:15): [True: 732k, False: 183k]
  ------------------
  963|   732k|    _mm_store_si128((__m128i *)dst, row);
  964|   732k|    _mm_store_si128((__m128i *)(dst + 8), row);
  965|   732k|    _mm_store_si128((__m128i *)(dst + 16), row);
  966|   732k|    _mm_store_si128((__m128i *)(dst + 24), row);
  967|   732k|    dst += stride;
  968|   732k|    _mm_store_si128((__m128i *)dst, row);
  969|   732k|    _mm_store_si128((__m128i *)(dst + 8), row);
  970|   732k|    _mm_store_si128((__m128i *)(dst + 16), row);
  971|   732k|    _mm_store_si128((__m128i *)(dst + 24), row);
  972|   732k|    dst += stride;
  973|   732k|    _mm_store_si128((__m128i *)dst, row);
  974|   732k|    _mm_store_si128((__m128i *)(dst + 8), row);
  975|   732k|    _mm_store_si128((__m128i *)(dst + 16), row);
  976|   732k|    _mm_store_si128((__m128i *)(dst + 24), row);
  977|   732k|    dst += stride;
  978|   732k|    _mm_store_si128((__m128i *)dst, row);
  979|   732k|    _mm_store_si128((__m128i *)(dst + 8), row);
  980|   732k|    _mm_store_si128((__m128i *)(dst + 16), row);
  981|   732k|    _mm_store_si128((__m128i *)(dst + 24), row);
  982|   732k|    dst += stride;
  983|   732k|  }
  984|   183k|}
highbd_intrapred_sse2.c:h_predictor_16x8:
  123|   787k|                                    const uint16_t *left) {
  124|   787k|  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  125|   787k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  126|   787k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  127|   787k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  128|   787k|  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  129|   787k|  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  130|   787k|  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  131|   787k|  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  132|   787k|  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  133|   787k|  h_store_16_unpacklo(&dst, stride, &row0);
  134|   787k|  h_store_16_unpacklo(&dst, stride, &row1);
  135|   787k|  h_store_16_unpacklo(&dst, stride, &row2);
  136|   787k|  h_store_16_unpacklo(&dst, stride, &row3);
  137|   787k|  h_store_16_unpackhi(&dst, stride, &row4);
  138|   787k|  h_store_16_unpackhi(&dst, stride, &row5);
  139|   787k|  h_store_16_unpackhi(&dst, stride, &row6);
  140|   787k|  h_store_16_unpackhi(&dst, stride, &row7);
  141|   787k|}
highbd_intrapred_sse2.c:h_store_16_unpacklo:
  107|  3.14M|                                       const __m128i *row) {
  108|  3.14M|  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  109|  3.14M|  _mm_store_si128((__m128i *)*dst, val);
  110|  3.14M|  _mm_store_si128((__m128i *)(*dst + 8), val);
  111|  3.14M|  *dst += stride;
  112|  3.14M|}
highbd_intrapred_sse2.c:h_store_16_unpackhi:
  115|  3.14M|                                       const __m128i *row) {
  116|  3.14M|  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  117|  3.14M|  _mm_store_si128((__m128i *)(*dst), val);
  118|  3.14M|  _mm_store_si128((__m128i *)(*dst + 8), val);
  119|  3.14M|  *dst += stride;
  120|  3.14M|}
highbd_intrapred_sse2.c:h_predictor_32x8:
  198|   797k|                                    const uint16_t *left) {
  199|   797k|  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
  200|   797k|  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  201|   797k|  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  202|   797k|  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  203|   797k|  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  204|   797k|  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
  205|   797k|  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
  206|   797k|  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
  207|   797k|  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
  208|   797k|  h_store_32_unpacklo(&dst, stride, &row0);
  209|   797k|  h_store_32_unpacklo(&dst, stride, &row1);
  210|   797k|  h_store_32_unpacklo(&dst, stride, &row2);
  211|   797k|  h_store_32_unpacklo(&dst, stride, &row3);
  212|   797k|  h_store_32_unpackhi(&dst, stride, &row4);
  213|   797k|  h_store_32_unpackhi(&dst, stride, &row5);
  214|   797k|  h_store_32_unpackhi(&dst, stride, &row6);
  215|   797k|  h_store_32_unpackhi(&dst, stride, &row7);
  216|   797k|}
highbd_intrapred_sse2.c:h_store_32_unpacklo:
  178|  3.19M|                                       const __m128i *row) {
  179|  3.19M|  const __m128i val = _mm_unpacklo_epi64(*row, *row);
  180|  3.19M|  _mm_store_si128((__m128i *)(*dst), val);
  181|  3.19M|  _mm_store_si128((__m128i *)(*dst + 8), val);
  182|  3.19M|  _mm_store_si128((__m128i *)(*dst + 16), val);
  183|  3.19M|  _mm_store_si128((__m128i *)(*dst + 24), val);
  184|  3.19M|  *dst += stride;
  185|  3.19M|}
highbd_intrapred_sse2.c:h_store_32_unpackhi:
  188|  3.19M|                                       const __m128i *row) {
  189|  3.19M|  const __m128i val = _mm_unpackhi_epi64(*row, *row);
  190|  3.19M|  _mm_store_si128((__m128i *)(*dst), val);
  191|  3.19M|  _mm_store_si128((__m128i *)(*dst + 8), val);
  192|  3.19M|  _mm_store_si128((__m128i *)(*dst + 16), val);
  193|  3.19M|  _mm_store_si128((__m128i *)(*dst + 24), val);
  194|  3.19M|  *dst += stride;
  195|  3.19M|}
highbd_intrapred_sse2.c:dc_sum_4:
  249|  2.35M|static inline __m128i dc_sum_4(const uint16_t *ref) {
  250|  2.35M|  const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
  251|  2.35M|  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
  252|  2.35M|  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
  253|  2.35M|  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
  254|  2.35M|}
highbd_intrapred_sse2.c:dc_store_4x4:
  257|  1.42M|                                const __m128i *dc) {
  258|  1.42M|  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  259|  1.42M|  int i;
  260|  7.11M|  for (i = 0; i < 4; ++i, dst += stride) {
  ------------------
  |  Branch (260:15): [True: 5.69M, False: 1.42M]
  ------------------
  261|  5.69M|    _mm_storel_epi64((__m128i *)dst, dc_dup);
  262|  5.69M|  }
  263|  1.42M|}
highbd_intrapred_sse2.c:dc_sum_8:
  310|  6.82M|static inline __m128i dc_sum_8(const uint16_t *ref) {
  311|  6.82M|  const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
  312|  6.82M|  const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
  313|  6.82M|  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
  314|  6.82M|  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
  315|       |
  316|  6.82M|  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
  317|  6.82M|}
highbd_intrapred_sse2.c:dc_store_4x8:
  301|  20.3k|                                const __m128i *dc) {
  302|  20.3k|  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
  303|  20.3k|  int i;
  304|   183k|  for (i = 0; i < 8; ++i, dst += stride) {
  ------------------
  |  Branch (304:15): [True: 162k, False: 20.3k]
  ------------------
  305|   162k|    _mm_storel_epi64((__m128i *)dst, dc_dup);
  306|   162k|  }
  307|  20.3k|}
highbd_intrapred_sse2.c:dc_top_predictor_8xh:
  368|  65.4k|                                        int height, const uint16_t *above) {
  369|  65.4k|  const __m128i four = _mm_cvtsi32_si128(4);
  370|  65.4k|  const __m128i sum = dc_sum_8(above);
  371|  65.4k|  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
  372|  65.4k|  dc_store_8xh(dst, stride, height, &dc);
  373|  65.4k|}
highbd_intrapred_sse2.c:dc_store_8xh:
  355|   123k|                                const __m128i *dc) {
  356|   123k|  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  357|   123k|  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  358|   123k|  int i;
  359|  1.10M|  for (i = 0; i < height; ++i, dst += stride) {
  ------------------
  |  Branch (359:15): [True: 979k, False: 123k]
  ------------------
  360|   979k|    _mm_store_si128((__m128i *)dst, dc_dup);
  361|   979k|  }
  362|   123k|}
highbd_intrapred_sse2.c:dc_sum_16:
  425|  2.45M|static inline __m128i dc_sum_16(const uint16_t *ref) {
  426|  2.45M|  const __m128i sum_lo = dc_sum_8(ref);
  427|  2.45M|  const __m128i sum_hi = dc_sum_8(ref + 8);
  428|  2.45M|  return _mm_add_epi16(sum_lo, sum_hi);
  429|  2.45M|}
highbd_intrapred_sse2.c:dc_128_predictor_8xh:
  446|  2.93k|                                        int height, int bd) {
  447|  2.93k|  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
  448|  2.93k|  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
  449|  2.93k|  dc_store_8xh(dst, stride, height, &dc_dup);
  450|  2.93k|}
highbd_intrapred_sse2.c:dc_store_16xh:
  480|   106k|                                 const __m128i *dc) {
  481|   106k|  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  482|   106k|  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  483|   106k|  int i;
  484|  2.03M|  for (i = 0; i < height; ++i, dst += stride) {
  ------------------
  |  Branch (484:15): [True: 1.93M, False: 106k]
  ------------------
  485|  1.93M|    _mm_store_si128((__m128i *)dst, dc_dup);
  486|  1.93M|    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
  487|  1.93M|  }
  488|   106k|}
highbd_intrapred_sse2.c:dc_sum_32:
  516|   565k|static inline __m128i dc_sum_32(const uint16_t *ref) {
  517|   565k|  const __m128i zero = _mm_setzero_si128();
  518|   565k|  const __m128i sum_a = dc_sum_16(ref);
  519|   565k|  const __m128i sum_b = dc_sum_16(ref + 16);
  520|       |  // 12 bit bd will outrange, so expand to 32 bit before adding final total
  521|   565k|  return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
  522|   565k|                       _mm_unpacklo_epi16(sum_b, zero));
  523|   565k|}
highbd_intrapred_sse2.c:dc_store_32xh:
  609|   247k|                                 const __m128i *dc) {
  610|   247k|  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
  611|   247k|  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
  612|   247k|  int i;
  613|  7.45M|  for (i = 0; i < height; ++i, dst += stride) {
  ------------------
  |  Branch (613:15): [True: 7.20M, False: 247k]
  ------------------
  614|  7.20M|    _mm_store_si128((__m128i *)dst, dc_dup);
  615|  7.20M|    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
  616|  7.20M|    _mm_store_si128((__m128i *)(dst + 16), dc_dup);
  617|  7.20M|    _mm_store_si128((__m128i *)(dst + 24), dc_dup);
  618|  7.20M|  }
  619|   247k|}

aom_highbd_lpf_horizontal_14_sse2:
  502|  8.22M|                                       const uint8_t *thresh, int bd) {
  503|  8.22M|  __m128i p[7], q[7], pq[7];
  504|  8.22M|  int i;
  505|       |
  506|  62.7M|  for (i = 0; i < 7; i++) {
  ------------------
  |  Branch (506:15): [True: 54.5M, False: 8.22M]
  ------------------
  507|  54.5M|    p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
  508|  54.5M|    q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
  509|  54.5M|  }
  510|       |
  511|  8.22M|  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
  512|       |
  513|  44.6M|  for (i = 0; i < 6; i++) {
  ------------------
  |  Branch (513:15): [True: 36.4M, False: 8.22M]
  ------------------
  514|  36.4M|    _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
  515|  36.4M|    _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
  516|  36.4M|  }
  517|  8.22M|}
aom_highbd_lpf_horizontal_6_sse2:
  952|  6.63M|                                      const uint8_t *_thresh, int bd) {
  953|  6.63M|  __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
  954|       |
  955|  6.63M|  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  956|  6.63M|  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  957|  6.63M|  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  958|  6.63M|  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
  959|  6.63M|  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
  960|  6.63M|  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
  961|       |
  962|  6.63M|  highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
  963|  6.63M|                             _blimit, _limit, _thresh, bd);
  964|       |
  965|  6.63M|  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
  966|  6.63M|  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
  967|  6.63M|  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
  968|  6.63M|  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
  969|  6.63M|}
aom_highbd_lpf_horizontal_8_sse2:
 1223|  3.09M|                                      const uint8_t *_thresh, int bd) {
 1224|  3.09M|  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
 1225|  3.09M|  __m128i q1q0, p1p0;
 1226|       |
 1227|  3.09M|  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
 1228|  3.09M|  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
 1229|  3.09M|  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
 1230|  3.09M|  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
 1231|  3.09M|  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
 1232|  3.09M|  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
 1233|  3.09M|  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
 1234|  3.09M|  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
 1235|       |
 1236|  3.09M|  highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
 1237|  3.09M|                             &p1p0, _blimit, _limit, _thresh, bd);
 1238|       |
 1239|  3.09M|  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
 1240|  3.09M|  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
 1241|  3.09M|  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
 1242|  3.09M|  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
 1243|  3.09M|  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
 1244|  3.09M|  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
 1245|  3.09M|}
aom_highbd_lpf_horizontal_4_sse2:
 1348|  40.3M|                                      const uint8_t *_thresh, int bd) {
 1349|  40.3M|  __m128i p1p0, q1q0;
 1350|  40.3M|  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
 1351|  40.3M|  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
 1352|  40.3M|  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
 1353|  40.3M|  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
 1354|       |
 1355|  40.3M|  highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
 1356|  40.3M|                             _thresh, bd);
 1357|       |
 1358|  40.3M|  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
 1359|  40.3M|  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
 1360|  40.3M|  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
 1361|  40.3M|  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
 1362|  40.3M|}
aom_highbd_lpf_vertical_4_sse2:
 1385|  35.9M|                                    int bd) {
 1386|  35.9M|  __m128i x0, x1, x2, x3, d0, d1, d2, d3;
 1387|  35.9M|  __m128i p1p0, q1q0;
 1388|  35.9M|  __m128i p1, q1;
 1389|       |
 1390|  35.9M|  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
 1391|  35.9M|  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
 1392|  35.9M|  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
 1393|  35.9M|  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
 1394|       |
 1395|  35.9M|  highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
 1396|       |
 1397|  35.9M|  highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
 1398|  35.9M|                             thresh, bd);
 1399|       |
 1400|  35.9M|  p1 = _mm_srli_si128(p1p0, 8);
 1401|  35.9M|  q1 = _mm_srli_si128(q1q0, 8);
 1402|       |
 1403|       |  // transpose from 8x4 to 4x8
 1404|  35.9M|  highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
 1405|       |
 1406|  35.9M|  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
 1407|  35.9M|  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
 1408|  35.9M|  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
 1409|  35.9M|  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
 1410|  35.9M|}
aom_highbd_lpf_vertical_6_sse2:
 1450|  5.33M|                                    int bd) {
 1451|  5.33M|  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
 1452|  5.33M|  __m128i x3, x2, x1, x0, p0, q0;
 1453|  5.33M|  __m128i p1p0, q1q0;
 1454|       |
 1455|  5.33M|  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
 1456|  5.33M|  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
 1457|  5.33M|  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
 1458|  5.33M|  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
 1459|       |
 1460|  5.33M|  highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
 1461|  5.33M|                               &d6, &d7);
 1462|       |
 1463|  5.33M|  highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
 1464|  5.33M|                             limit, thresh, bd);
 1465|       |
 1466|  5.33M|  p0 = _mm_srli_si128(p1p0, 8);
 1467|  5.33M|  q0 = _mm_srli_si128(q1q0, 8);
 1468|       |
 1469|  5.33M|  highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
 1470|       |
 1471|  5.33M|  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
 1472|  5.33M|  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
 1473|  5.33M|  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
 1474|  5.33M|  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
 1475|  5.33M|}
aom_highbd_lpf_vertical_8_sse2:
 1516|   496k|                                    int bd) {
 1517|   496k|  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
 1518|   496k|  __m128i p2, p1, p0, p3, q0;
 1519|   496k|  __m128i q1q0, p1p0;
 1520|       |
 1521|   496k|  p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
 1522|   496k|  p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
 1523|   496k|  p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
 1524|   496k|  p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
 1525|       |
 1526|   496k|  highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
 1527|   496k|                               &d6, &d7);
 1528|       |
 1529|       |  // Loop filtering
 1530|   496k|  highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
 1531|   496k|                             &p1p0, blimit, limit, thresh, bd);
 1532|       |
 1533|   496k|  p0 = _mm_srli_si128(p1p0, 8);
 1534|   496k|  q0 = _mm_srli_si128(q1q0, 8);
 1535|       |
 1536|   496k|  highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
 1537|   496k|                               &d1, &d2, &d3);
 1538|       |
 1539|   496k|  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
 1540|   496k|  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
 1541|   496k|  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
 1542|   496k|  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
 1543|   496k|}
aom_highbd_lpf_vertical_14_sse2:
 1584|  6.88M|                                     const uint8_t *thresh, int bd) {
 1585|  6.88M|  __m128i q[7], p[7], pq[7];
 1586|  6.88M|  __m128i p6, p5, p4, p3;
 1587|  6.88M|  __m128i p6_2, p5_2, p4_2, p3_2;
 1588|  6.88M|  __m128i d0, d1, d2, d3;
 1589|  6.88M|  __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
 1590|       |
 1591|  6.88M|  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
 1592|  6.88M|  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
 1593|  6.88M|  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
 1594|  6.88M|  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
 1595|       |
 1596|  6.88M|  highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
 1597|  6.88M|                               &p[3], &p[2], &p[1], &p[0]);
 1598|       |
 1599|  6.88M|  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
 1600|  6.88M|  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
 1601|  6.88M|  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
 1602|  6.88M|  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
 1603|       |
 1604|  6.88M|  highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
 1605|  6.88M|                               &q[3], &q[4], &q[5], &q[6], &d7_2);
 1606|       |
 1607|  6.88M|  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
 1608|       |
 1609|  6.88M|  highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
 1610|  6.88M|                               &pq[1], &pq[0], &d0, &d1, &d2, &d3);
 1611|       |
 1612|  6.88M|  q[0] = _mm_srli_si128(pq[0], 8);
 1613|  6.88M|  q[1] = _mm_srli_si128(pq[1], 8);
 1614|  6.88M|  q[2] = _mm_srli_si128(pq[2], 8);
 1615|  6.88M|  q[3] = _mm_srli_si128(pq[3], 8);
 1616|  6.88M|  q[4] = _mm_srli_si128(pq[4], 8);
 1617|  6.88M|  q[5] = _mm_srli_si128(pq[5], 8);
 1618|       |
 1619|  6.88M|  highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
 1620|  6.88M|                               &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
 1621|       |
 1622|  6.88M|  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
 1623|  6.88M|  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
 1624|       |
 1625|  6.88M|  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
 1626|  6.88M|  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
 1627|       |
 1628|  6.88M|  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
 1629|  6.88M|  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
 1630|       |
 1631|  6.88M|  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
 1632|  6.88M|  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
 1633|  6.88M|}
highbd_loopfilter_sse2.c:highbd_lpf_internal_14_sse2:
  328|  14.0M|    const unsigned char *lt, const unsigned char *thr, int bd) {
  329|  14.0M|  int i;
  330|  14.0M|  const __m128i zero = _mm_setzero_si128();
  331|  14.0M|  __m128i blimit, limit, thresh;
  332|  14.0M|  __m128i t80;
  333|  14.0M|  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
  334|       |
  335|   105M|  for (i = 0; i < 7; i++) {
  ------------------
  |  Branch (335:15): [True: 91.6M, False: 14.0M]
  ------------------
  336|  91.6M|    pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
  337|  91.6M|  }
  338|  14.0M|  __m128i mask, hevhev;
  339|  14.0M|  __m128i p1p0, q1q0, abs_p1p0;
  340|       |
  341|  14.0M|  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
  342|  14.0M|                                &thresh, &hevhev, &mask);
  343|       |
  344|  14.0M|  __m128i ps0ps1, qs0qs1;
  345|       |  // filter4
  346|  14.0M|  highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
  347|       |
  348|  14.0M|  __m128i flat, flat2;
  349|  14.0M|  highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
  350|       |
  351|  14.0M|  flat = _mm_and_si128(flat, mask);
  352|  14.0M|  flat2 = _mm_and_si128(flat2, flat);
  353|       |
  354|       |  // replicate for the further "merged variables" usage
  355|  14.0M|  flat = _mm_unpacklo_epi64(flat, flat);
  356|  14.0M|  flat2 = _mm_unpacklo_epi64(flat2, flat2);
  357|       |
  358|       |  // flat and wide flat calculations
  359|       |
  360|       |  // if flat ==0 then flat2 is zero as well and we don't need any calc below
  361|       |  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
  362|  14.0M|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
  ------------------
  |  Branch (362:7): [True: 7.33M, False: 6.69M]
  ------------------
  363|  7.33M|    __m128i flat_p[3], flat_q[3], flat_pq[3];
  364|  7.33M|    __m128i flat2_p[6], flat2_q[6];
  365|  7.33M|    __m128i flat2_pq[6];
  366|  7.33M|    __m128i sum_p6, sum_p3;
  367|  7.33M|    const __m128i eight = _mm_set1_epi16(8);
  368|  7.33M|    const __m128i four = _mm_set1_epi16(4);
  369|       |
  370|  7.33M|    __m128i work0, work0_0, work0_1, sum_p_0;
  371|  7.33M|    __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
  372|  7.33M|    __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
  373|  7.33M|    sum_p = _mm_add_epi16(sum_p, sum_lp);
  374|       |
  375|  7.33M|    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
  376|  7.33M|    __m128i sum_q = _mm_srli_si128(sum_p, 8);
  377|       |
  378|  7.33M|    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
  379|  7.33M|    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
  380|       |
  381|  7.33M|    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
  382|  7.33M|    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
  383|       |
  384|  7.33M|    sum_p6 = _mm_add_epi16(pq[6], pq[6]);
  385|  7.33M|    sum_p3 = _mm_add_epi16(pq[3], pq[3]);
  386|       |
  387|  7.33M|    sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
  388|  7.33M|    sum_p = _mm_sub_epi16(sum_p_0, q[5]);
  389|       |
  390|  7.33M|    work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
  391|  7.33M|    work0_1 = _mm_add_epi16(sum_p6,
  392|  7.33M|                            _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
  393|       |
  394|  7.33M|    sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
  395|  7.33M|    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
  396|       |
  397|  7.33M|    work0 = _mm_add_epi16(sum_p3, pq[1]);
  398|  7.33M|    flat_p[1] = _mm_add_epi16(sum_lp, work0);
  399|  7.33M|    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
  400|       |
  401|  7.33M|    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
  402|  7.33M|    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
  403|       |
  404|  7.33M|    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
  405|  7.33M|    sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
  406|       |
  407|  7.33M|    sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
  408|  7.33M|    work0 = _mm_add_epi16(sum_p3, pq[2]);
  409|       |
  410|  7.33M|    flat_p[2] = _mm_add_epi16(sum_lp, work0);
  411|  7.33M|    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
  412|  7.33M|    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
  413|       |
  414|  7.33M|    int flat2_mask =
  415|  7.33M|        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
  416|  7.33M|    if (flat2_mask) {
  ------------------
  |  Branch (416:9): [True: 6.54M, False: 791k]
  ------------------
  417|  6.54M|      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
  418|  6.54M|      flat2_q[0] = _mm_add_epi16(
  419|  6.54M|          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
  420|       |
  421|  6.54M|      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
  422|  6.54M|      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
  423|       |
  424|  6.54M|      flat2_pq[0] =
  425|  6.54M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
  426|  6.54M|      flat2_pq[1] =
  427|  6.54M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
  428|       |
  429|  6.54M|      sum_p = _mm_sub_epi16(sum_p, q[4]);
  430|  6.54M|      sum_q = _mm_sub_epi16(sum_q, pq[4]);
  431|       |
  432|  6.54M|      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
  433|  6.54M|      work0 = _mm_add_epi16(sum_p6,
  434|  6.54M|                            _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
  435|  6.54M|      flat2_p[2] = _mm_add_epi16(sum_p, work0);
  436|  6.54M|      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  437|  6.54M|      flat2_pq[2] =
  438|  6.54M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
  439|       |
  440|  6.54M|      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
  441|  6.54M|      sum_p = _mm_sub_epi16(sum_p, q[3]);
  442|  6.54M|      sum_q = _mm_sub_epi16(sum_q, pq[3]);
  443|       |
  444|  6.54M|      work0 = _mm_add_epi16(sum_p6,
  445|  6.54M|                            _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
  446|  6.54M|      flat2_p[3] = _mm_add_epi16(sum_p, work0);
  447|  6.54M|      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  448|  6.54M|      flat2_pq[3] =
  449|  6.54M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
  450|       |
  451|  6.54M|      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
  452|  6.54M|      sum_p = _mm_sub_epi16(sum_p, q[2]);
  453|  6.54M|      sum_q = _mm_sub_epi16(sum_q, pq[2]);
  454|       |
  455|  6.54M|      work0 = _mm_add_epi16(sum_p6,
  456|  6.54M|                            _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
  457|  6.54M|      flat2_p[4] = _mm_add_epi16(sum_p, work0);
  458|  6.54M|      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  459|  6.54M|      flat2_pq[4] =
  460|  6.54M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
  461|       |
  462|  6.54M|      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
  463|  6.54M|      sum_p = _mm_sub_epi16(sum_p, q[1]);
  464|  6.54M|      sum_q = _mm_sub_epi16(sum_q, pq[1]);
  465|       |
  466|  6.54M|      work0 = _mm_add_epi16(sum_p6,
  467|  6.54M|                            _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
  468|  6.54M|      flat2_p[5] = _mm_add_epi16(sum_p, work0);
  469|  6.54M|      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  470|  6.54M|      flat2_pq[5] =
  471|  6.54M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
  472|  6.54M|    }  // flat2
  473|       |       // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  474|       |    // highbd_filter8
  475|  7.33M|    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
  476|  7.33M|    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
  477|       |
  478|  29.0M|    for (i = 0; i < 3; i++) {
  ------------------
  |  Branch (478:17): [True: 21.7M, False: 7.33M]
  ------------------
  479|  21.7M|      pq[i] = _mm_andnot_si128(flat, pq[i]);
  480|  21.7M|      flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
  481|  21.7M|      pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
  482|  21.7M|    }
  483|       |
  484|       |    // wide flat
  485|       |    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  486|  7.33M|    if (flat2_mask) {
  ------------------
  |  Branch (486:9): [True: 6.17M, False: 1.15M]
  ------------------
  487|  43.2M|      for (i = 0; i < 6; i++) {
  ------------------
  |  Branch (487:19): [True: 37.0M, False: 6.17M]
  ------------------
  488|  37.0M|        pq[i] = _mm_andnot_si128(flat2, pq[i]);
  489|  37.0M|        flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
  490|  37.0M|        pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
  491|  37.0M|      }
  492|  6.17M|    }
  493|  7.33M|  } else {
  494|  6.69M|    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
  495|  6.69M|    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
  496|  6.69M|  }
  497|  14.0M|}
highbd_loopfilter_sse2.c:get_limit:
   30|  88.8M|                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
   31|  88.8M|  const int shift = bd - 8;
   32|  88.8M|  const __m128i zero = _mm_setzero_si128();
   33|       |
   34|  88.8M|  __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
   35|  88.8M|  *blt = _mm_slli_epi16(x, shift);
   36|       |
   37|  88.8M|  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
   38|  88.8M|  *lt = _mm_slli_epi16(x, shift);
   39|       |
   40|  88.8M|  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
   41|  88.8M|  *thr = _mm_slli_epi16(x, shift);
   42|       |
   43|  88.8M|  *t80_out = _mm_set1_epi16(1 << (bd - 1));
   44|  88.8M|}
highbd_loopfilter_sse2.c:highbd_hev_filter_mask_x_sse2:
  112|  78.9M|                                                 __m128i *hev, __m128i *mask) {
  113|  78.9M|  const __m128i zero = _mm_setzero_si128();
  114|  78.9M|  const __m128i one = _mm_set1_epi16(1);
  115|  78.9M|  const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
  116|  78.9M|  __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
  117|  78.9M|  __m128i max, max01, h;
  118|       |
  119|  78.9M|  *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
  120|  78.9M|  *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
  121|       |
  122|  78.9M|  abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
  123|  78.9M|  abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
  124|  78.9M|  abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
  125|       |
  126|  78.9M|  abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
  127|  78.9M|  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // divide by 2
  128|       |
  129|  78.9M|  max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
  130|  78.9M|  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
  131|       |  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
  132|       |  // So taking maximums continues to work:
  133|  78.9M|  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
  134|       |
  135|  78.9M|  *abs_p1p0 = abs_diff16(pq[0], pq[1]);
  136|  78.9M|  abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
  137|  78.9M|  max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
  138|       |  // mask |= (abs(*p1 - *p0) > limit) * -1;
  139|       |  // mask |= (abs(*q1 - *q0) > limit) * -1;
  140|  78.9M|  h = _mm_subs_epu16(max01, *t);
  141|       |
  142|  78.9M|  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
  143|       |  // replicate for the further "merged variables" usage
  144|  78.9M|  *hev = _mm_unpacklo_epi64(*hev, *hev);
  145|       |
  146|  78.9M|  max = _mm_max_epi16(max, max01);
  147|  78.9M|  int i;
  148|   114M|  for (i = 2; i < x; ++i) {
  ------------------
  |  Branch (148:15): [True: 35.9M, False: 78.9M]
  ------------------
  149|  35.9M|    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
  150|  35.9M|  }
  151|  78.9M|  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
  152|       |
  153|  78.9M|  max = _mm_subs_epu16(max, *l);
  154|  78.9M|  *mask = _mm_cmpeq_epi16(max, zero);  //  ~mask
  155|  78.9M|}
highbd_loopfilter_sse2.c:abs_diff16:
   24|   253M|static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
   25|   253M|  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
   26|   253M|}
highbd_loopfilter_sse2.c:highbd_filter4_sse2:
  217|  76.7M|                                                 int bd) {
  218|  76.7M|  const __m128i zero = _mm_setzero_si128();
  219|  76.7M|  const __m128i one = _mm_set1_epi16(1);
  220|  76.7M|  const __m128i pmax =
  221|  76.7M|      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
  222|  76.7M|  const __m128i pmin = _mm_subs_epi16(zero, *t80);
  223|       |
  224|  76.7M|  const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
  225|  76.7M|  __m128i ps1ps0_work, qs1qs0_work, work;
  226|  76.7M|  __m128i filt, filter2filter1, filter2filt, filter1filt;
  227|       |
  228|  76.7M|  ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
  229|  76.7M|  qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
  230|       |
  231|  76.7M|  work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
  232|  76.7M|  pixel_clamp(&pmin, &pmax, &work);
  233|  76.7M|  filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
  234|       |
  235|  76.7M|  filt = _mm_subs_epi16(filt, work);
  236|  76.7M|  filt = _mm_subs_epi16(filt, work);
  237|  76.7M|  filt = _mm_subs_epi16(filt, work);
  238|       |  // (aom_filter + 3 * (qs0 - ps0)) & mask
  239|  76.7M|  pixel_clamp(&pmin, &pmax, &filt);
  240|  76.7M|  filt = _mm_and_si128(filt, *mask);
  241|  76.7M|  filt = _mm_unpacklo_epi64(filt, filt);
  242|       |
  243|  76.7M|  filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
  244|  76.7M|  pixel_clamp(&pmin, &pmax, &filter2filter1);
  245|  76.7M|  filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
  246|       |
  247|  76.7M|  filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
  248|       |
  249|       |  // filt >> 1
  250|  76.7M|  filt = _mm_adds_epi16(filt, one);
  251|  76.7M|  filt = _mm_srai_epi16(filt, 1);
  252|  76.7M|  filt = _mm_andnot_si128(*hev, filt);
  253|       |
  254|  76.7M|  filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
  255|  76.7M|  filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
  256|       |
  257|  76.7M|  qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
  258|  76.7M|  ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
  259|       |
  260|  76.7M|  pixel_clamp(&pmin, &pmax, &qs1qs0_work);
  261|  76.7M|  pixel_clamp(&pmin, &pmax, &ps1ps0_work);
  262|       |
  263|  76.7M|  *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
  264|  76.7M|  *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
  265|  76.7M|}
highbd_loopfilter_sse2.c:pixel_clamp:
   19|   376M|                                         __m128i *pixel) {
   20|   376M|  *pixel = _mm_min_epi16(*pixel, *max);
   21|   376M|  *pixel = _mm_max_epi16(*pixel, *min);
   22|   376M|}
highbd_loopfilter_sse2.c:highbd_flat_mask4_sse2:
  195|  13.0M|                                          __m128i *flat2, int bd) {
  196|       |  // check the distance 1,2,3 against 0
  197|  13.0M|  __m128i th = _mm_set1_epi16(1);
  198|  13.0M|  th = _mm_slli_epi16(th, bd - 8);
  199|  13.0M|  flat_mask_internal(&th, pq, 1, 4, flat);
  200|  13.0M|  flat_mask_internal(&th, pq, 4, 7, flat2);
  201|  13.0M|}
highbd_loopfilter_sse2.c:flat_mask_internal:
  158|  27.8M|                                      int start, int end, __m128i *flat) {
  159|  27.8M|  int i;
  160|  27.8M|  __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
  161|  27.8M|                              abs_diff16(pq[start + 1], pq[0]));
  162|       |
  163|  55.7M|  for (i = start + 2; i < end; ++i) {
  ------------------
  |  Branch (163:23): [True: 27.8M, False: 27.8M]
  ------------------
  164|  27.8M|    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
  165|  27.8M|  }
  166|  27.8M|  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
  167|       |
  168|  27.8M|  __m128i ft;
  169|  27.8M|  ft = _mm_subs_epu16(max, *th);
  170|       |
  171|  27.8M|  const __m128i zero = _mm_setzero_si128();
  172|  27.8M|  *flat = _mm_cmpeq_epi16(ft, zero);
  173|  27.8M|}
highbd_loopfilter_sse2.c:highbd_lpf_internal_6_sse2:
  739|  11.5M|    const uint8_t *_limit, const uint8_t *_thresh, int bd) {
  740|  11.5M|  __m128i blimit, limit, thresh;
  741|  11.5M|  __m128i mask, hev, flat;
  742|  11.5M|  __m128i pq[3];
  743|  11.5M|  __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
  744|  11.5M|  __m128i flat_p1p0, flat_q0q1;
  745|       |
  746|  11.5M|  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
  747|  11.5M|  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
  748|  11.5M|  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
  749|       |
  750|  11.5M|  const __m128i zero = _mm_setzero_si128();
  751|  11.5M|  const __m128i four = _mm_set1_epi16(4);
  752|  11.5M|  __m128i t80;
  753|  11.5M|  const __m128i one = _mm_set1_epi16(0x1);
  754|       |
  755|  11.5M|  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
  756|       |
  757|  11.5M|  highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
  758|  11.5M|                                &thresh, &hev, &mask);
  759|       |
  760|       |  // lp filter
  761|  11.5M|  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
  762|       |
  763|       |  // flat_mask
  764|  11.5M|  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
  765|  11.5M|  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
  766|       |
  767|  11.5M|  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
  768|       |
  769|  11.5M|  flat = _mm_cmpeq_epi16(flat, zero);
  770|  11.5M|  flat = _mm_and_si128(flat, mask);
  771|       |  // replicate for the further "merged variables" usage
  772|  11.5M|  flat = _mm_unpacklo_epi64(flat, flat);
  773|       |
  774|       |  // 5 tap filter
  775|       |  // need it only if flat !=0
  776|  11.5M|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
  ------------------
  |  Branch (776:7): [True: 7.22M, False: 4.30M]
  ------------------
  777|  7.22M|    __m128i workp_a, workp_b, workp_c;
  778|  7.22M|    __m128i pq0x2_pq1, pq1_pq2;
  779|       |
  780|       |    // op1
  781|  7.22M|    pq0x2_pq1 =
  782|  7.22M|        _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]);  // p0 *2 + p1
  783|  7.22M|    pq1_pq2 = _mm_add_epi16(pq[1], pq[2]);                  // p1 + p2
  784|  7.22M|    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
  785|  7.22M|                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
  786|       |
  787|  7.22M|    workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
  788|  7.22M|    workp_b =
  789|  7.22M|        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
  790|       |
  791|       |    // op0
  792|  7.22M|    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
  793|  7.22M|    workp_a = _mm_add_epi16(workp_a,
  794|  7.22M|                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
  795|  7.22M|    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
  796|  7.22M|    flat_p1p0 = _mm_srli_epi16(workp_b, 3);
  797|       |
  798|       |    // oq0
  799|  7.22M|    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
  800|  7.22M|                            pq[1]);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
  801|  7.22M|    workp_b = _mm_srli_si128(pq1_pq2, 8);
  802|  7.22M|    workp_a = _mm_add_epi16(
  803|  7.22M|        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
  804|       |    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
  805|       |
  806|       |    // oq1
  807|  7.22M|    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
  808|  7.22M|                            pq[0]);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
  809|  7.22M|    workp_b = _mm_add_epi16(*q2, *q2);
  810|  7.22M|    workp_b =
  811|  7.22M|        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
  812|       |
  813|  7.22M|    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
  814|  7.22M|    flat_q0q1 = _mm_srli_epi16(workp_a, 3);
  815|       |
  816|  7.22M|    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
  817|  7.22M|    q1q0 = _mm_and_si128(flat, flat_q0q1);
  818|  7.22M|    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
  819|       |
  820|  7.22M|    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
  821|  7.22M|    p1p0 = _mm_and_si128(flat, flat_p1p0);
  822|  7.22M|    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
  823|  7.22M|  }
  824|  11.5M|}
highbd_loopfilter_sse2.c:highbd_lpf_internal_8_sse2:
  998|  3.47M|    const unsigned char *_thresh, int bd) {
  999|  3.47M|  const __m128i zero = _mm_setzero_si128();
 1000|  3.47M|  __m128i blimit, limit, thresh;
 1001|  3.47M|  __m128i mask, hev, flat;
 1002|  3.47M|  __m128i pq[4];
 1003|  3.47M|  __m128i p1p0, q1q0, ps1ps0, qs1qs0;
 1004|  3.47M|  __m128i work_a, opq2, flat_p1p0, flat_q0q1;
 1005|       |
 1006|  3.47M|  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
 1007|  3.47M|  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
 1008|  3.47M|  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
 1009|  3.47M|  pq[3] = _mm_unpacklo_epi64(*p3, *q3);
 1010|       |
 1011|  3.47M|  __m128i abs_p1p0;
 1012|       |
 1013|  3.47M|  const __m128i four = _mm_set1_epi16(4);
 1014|  3.47M|  __m128i t80;
 1015|  3.47M|  const __m128i one = _mm_set1_epi16(0x1);
 1016|       |
 1017|  3.47M|  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
 1018|       |
 1019|  3.47M|  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
 1020|  3.47M|                                &thresh, &hev, &mask);
 1021|       |
 1022|       |  // lp filter
 1023|  3.47M|  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
 1024|       |
 1025|       |  // flat_mask4
 1026|  3.47M|  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
 1027|  3.47M|  flat = _mm_max_epi16(abs_p1p0, flat);
 1028|  3.47M|  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
 1029|       |
 1030|  3.47M|  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
 1031|       |
 1032|  3.47M|  flat = _mm_cmpeq_epi16(flat, zero);
 1033|  3.47M|  flat = _mm_and_si128(flat, mask);
 1034|       |  // replicate for the further "merged variables" usage
 1035|  3.47M|  flat = _mm_unpacklo_epi64(flat, flat);
 1036|       |
 1037|  3.47M|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
  ------------------
  |  Branch (1037:7): [True: 754k, False: 2.72M]
  ------------------
 1038|   754k|    __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
 1039|       |    // Added before shift for rounding part of ROUND_POWER_OF_TWO
 1040|       |
 1041|       |    // o*p2
 1042|   754k|    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
 1043|   754k|    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
 1044|   754k|    workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
 1045|   754k|    workp_c = _mm_add_epi16(workp_a, workp_c);
 1046|       |
 1047|       |    // o*p1
 1048|   754k|    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
 1049|   754k|    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
 1050|       |
 1051|       |    // o*p0
 1052|   754k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
 1053|   754k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
 1054|   754k|    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
 1055|       |
 1056|   754k|    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
 1057|       |
 1058|       |    // oq0
 1059|   754k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
 1060|   754k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
 1061|   754k|    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
 1062|       |
 1063|       |    // oq1
 1064|   754k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
 1065|   754k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
 1066|   754k|    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
 1067|       |
 1068|   754k|    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
 1069|       |
 1070|       |    // oq2
 1071|   754k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
 1072|   754k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
 1073|   754k|    workp_a = _mm_add_epi16(workp_a, workp_b);
 1074|   754k|    opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
 1075|       |
 1076|   754k|    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
 1077|   754k|    q1q0 = _mm_and_si128(flat, flat_q0q1);
 1078|   754k|    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 1079|       |
 1080|   754k|    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
 1081|   754k|    p1p0 = _mm_and_si128(flat, flat_p1p0);
 1082|   754k|    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
 1083|       |
 1084|   754k|    work_a = _mm_andnot_si128(flat, pq[2]);
 1085|   754k|    *p2 = _mm_and_si128(flat, opq2);
 1086|   754k|    *p2 = _mm_or_si128(work_a, *p2);
 1087|   754k|    *q2 = _mm_srli_si128(*p2, 8);
 1088|   754k|  }
 1089|  3.47M|}
highbd_loopfilter_sse2.c:highbd_lpf_internal_4_sse2:
 1277|  66.4M|    const uint8_t *_thresh, int bd) {
 1278|  66.4M|  __m128i blimit, limit, thresh;
 1279|  66.4M|  __m128i mask, hev;
 1280|  66.4M|  __m128i p1p0, q1q0;
 1281|  66.4M|  __m128i pq[2];
 1282|       |
 1283|  66.4M|  __m128i abs_p1p0;
 1284|       |
 1285|  66.4M|  __m128i t80;
 1286|  66.4M|  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
 1287|       |
 1288|  66.4M|  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
 1289|  66.4M|  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
 1290|       |
 1291|  66.4M|  highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
 1292|  66.4M|                                &thresh, &hev, &mask);
 1293|       |
 1294|  66.4M|  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
 1295|  66.4M|}

aom_dc_predictor_32x32_avx2:
  323|   912k|                                 const uint8_t *above, const uint8_t *left) {
  324|   912k|  const __m256i sum_above = dc_sum_32(above);
  325|   912k|  __m256i sum_left = dc_sum_32(left);
  326|   912k|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  327|   912k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  328|   912k|  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
  329|   912k|  sum_left = _mm256_srai_epi16(sum_left, 6);
  330|   912k|  const __m256i zero = _mm256_setzero_si256();
  331|   912k|  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
  332|   912k|  row_store_32xh(&row, 32, dst, stride);
  333|   912k|}
aom_dc_top_predictor_32x32_avx2:
  337|  63.3k|                                     const uint8_t *left) {
  338|  63.3k|  __m256i sum = dc_sum_32(above);
  339|  63.3k|  (void)left;
  340|       |
  341|  63.3k|  const __m256i sixteen = _mm256_set1_epi16(16);
  342|  63.3k|  sum = _mm256_add_epi16(sum, sixteen);
  343|  63.3k|  sum = _mm256_srai_epi16(sum, 5);
  344|  63.3k|  const __m256i zero = _mm256_setzero_si256();
  345|  63.3k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  346|  63.3k|  row_store_32xh(&row, 32, dst, stride);
  347|  63.3k|}
aom_dc_left_predictor_32x32_avx2:
  351|   109k|                                      const uint8_t *left) {
  352|   109k|  __m256i sum = dc_sum_32(left);
  353|   109k|  (void)above;
  354|       |
  355|   109k|  const __m256i sixteen = _mm256_set1_epi16(16);
  356|   109k|  sum = _mm256_add_epi16(sum, sixteen);
  357|   109k|  sum = _mm256_srai_epi16(sum, 5);
  358|   109k|  const __m256i zero = _mm256_setzero_si256();
  359|   109k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  360|   109k|  row_store_32xh(&row, 32, dst, stride);
  361|   109k|}
aom_dc_128_predictor_32x32_avx2:
  365|  20.6k|                                     const uint8_t *left) {
  366|  20.6k|  (void)above;
  367|  20.6k|  (void)left;
  368|  20.6k|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  369|  20.6k|  row_store_32xh(&row, 32, dst, stride);
  370|  20.6k|}
aom_v_predictor_32x32_avx2:
  373|  28.3k|                                const uint8_t *above, const uint8_t *left) {
  374|  28.3k|  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  375|  28.3k|  (void)left;
  376|  28.3k|  row_store_32xh(&row, 32, dst, stride);
  377|  28.3k|}
aom_h_predictor_32x32_avx2:
  402|   153k|                                const uint8_t *above, const uint8_t *left) {
  403|   153k|  (void)above;
  404|   153k|  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
  405|       |
  406|   153k|  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
  407|       |
  408|   153k|  __m256i v = _mm256_unpacklo_epi8(u, u);
  409|   153k|  h_predictor_32x8line(&v, dst, stride);
  410|   153k|  dst += stride << 2;
  411|       |
  412|   153k|  v = _mm256_unpackhi_epi8(u, u);
  413|   153k|  h_predictor_32x8line(&v, dst, stride);
  414|   153k|  dst += stride << 2;
  415|       |
  416|   153k|  u = _mm256_unpackhi_epi8(left_col, left_col);
  417|       |
  418|   153k|  v = _mm256_unpacklo_epi8(u, u);
  419|   153k|  h_predictor_32x8line(&v, dst, stride);
  420|   153k|  dst += stride << 2;
  421|       |
  422|   153k|  v = _mm256_unpackhi_epi8(u, u);
  423|   153k|  h_predictor_32x8line(&v, dst, stride);
  424|   153k|}
aom_dc_predictor_32x16_avx2:
  429|   147k|                                 const uint8_t *above, const uint8_t *left) {
  430|   147k|  const __m128i top_sum = dc_sum_32_sse2(above);
  431|   147k|  __m128i left_sum = dc_sum_16_sse2(left);
  432|   147k|  left_sum = _mm_add_epi16(top_sum, left_sum);
  433|   147k|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
  434|   147k|  sum += 24;
  435|   147k|  sum /= 48;
  436|   147k|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  437|   147k|  row_store_32xh(&row, 16, dst, stride);
  438|   147k|}
aom_dc_predictor_32x64_avx2:
  441|  7.18k|                                 const uint8_t *above, const uint8_t *left) {
  442|  7.18k|  const __m256i sum_above = dc_sum_32(above);
  443|  7.18k|  __m256i sum_left = dc_sum_64(left);
  444|  7.18k|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  445|  7.18k|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  446|  7.18k|  sum += 48;
  447|  7.18k|  sum /= 96;
  448|  7.18k|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  449|  7.18k|  row_store_32xh(&row, 64, dst, stride);
  450|  7.18k|}
aom_dc_predictor_64x64_avx2:
  453|   100k|                                 const uint8_t *above, const uint8_t *left) {
  454|   100k|  const __m256i sum_above = dc_sum_64(above);
  455|   100k|  __m256i sum_left = dc_sum_64(left);
  456|   100k|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  457|   100k|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  458|   100k|  sum += 64;
  459|   100k|  sum /= 128;
  460|   100k|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  461|   100k|  row_store_64xh(&row, 64, dst, stride);
  462|   100k|}
aom_dc_predictor_64x32_avx2:
  465|  15.0k|                                 const uint8_t *above, const uint8_t *left) {
  466|  15.0k|  const __m256i sum_above = dc_sum_64(above);
  467|  15.0k|  __m256i sum_left = dc_sum_32(left);
  468|  15.0k|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  469|  15.0k|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  470|  15.0k|  sum += 48;
  471|  15.0k|  sum /= 96;
  472|  15.0k|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  473|  15.0k|  row_store_64xh(&row, 32, dst, stride);
  474|  15.0k|}
aom_dc_predictor_64x16_avx2:
  478|  56.6k|                                 const uint8_t *above, const uint8_t *left) {
  479|  56.6k|  const __m256i sum_above = dc_sum_64(above);
  480|  56.6k|  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
  481|  56.6k|  sum_left = _mm256_add_epi16(sum_left, sum_above);
  482|  56.6k|  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  483|  56.6k|  sum += 40;
  484|  56.6k|  sum /= 80;
  485|  56.6k|  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  486|  56.6k|  row_store_64xh(&row, 16, dst, stride);
  487|  56.6k|}
aom_dc_top_predictor_32x16_avx2:
  492|  4.40k|                                     const uint8_t *left) {
  493|  4.40k|  __m256i sum = dc_sum_32(above);
  494|  4.40k|  (void)left;
  495|       |
  496|  4.40k|  const __m256i sixteen = _mm256_set1_epi16(16);
  497|  4.40k|  sum = _mm256_add_epi16(sum, sixteen);
  498|  4.40k|  sum = _mm256_srai_epi16(sum, 5);
  499|  4.40k|  const __m256i zero = _mm256_setzero_si256();
  500|  4.40k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  501|  4.40k|  row_store_32xh(&row, 16, dst, stride);
  502|  4.40k|}
aom_dc_top_predictor_32x64_avx2:
  506|  1.07k|                                     const uint8_t *left) {
  507|  1.07k|  __m256i sum = dc_sum_32(above);
  508|  1.07k|  (void)left;
  509|       |
  510|  1.07k|  const __m256i sixteen = _mm256_set1_epi16(16);
  511|  1.07k|  sum = _mm256_add_epi16(sum, sixteen);
  512|  1.07k|  sum = _mm256_srai_epi16(sum, 5);
  513|  1.07k|  const __m256i zero = _mm256_setzero_si256();
  514|  1.07k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  515|  1.07k|  row_store_32xh(&row, 64, dst, stride);
  516|  1.07k|}
aom_dc_top_predictor_64x64_avx2:
  520|  14.2k|                                     const uint8_t *left) {
  521|  14.2k|  __m256i sum = dc_sum_64(above);
  522|  14.2k|  (void)left;
  523|       |
  524|  14.2k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  525|  14.2k|  sum = _mm256_add_epi16(sum, thirtytwo);
  526|  14.2k|  sum = _mm256_srai_epi16(sum, 6);
  527|  14.2k|  const __m256i zero = _mm256_setzero_si256();
  528|  14.2k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  529|  14.2k|  row_store_64xh(&row, 64, dst, stride);
  530|  14.2k|}
aom_dc_top_predictor_64x32_avx2:
  534|    421|                                     const uint8_t *left) {
  535|    421|  __m256i sum = dc_sum_64(above);
  536|    421|  (void)left;
  537|       |
  538|    421|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  539|    421|  sum = _mm256_add_epi16(sum, thirtytwo);
  540|    421|  sum = _mm256_srai_epi16(sum, 6);
  541|    421|  const __m256i zero = _mm256_setzero_si256();
  542|    421|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  543|    421|  row_store_64xh(&row, 32, dst, stride);
  544|    421|}
aom_dc_top_predictor_64x16_avx2:
  549|  2.06k|                                     const uint8_t *left) {
  550|  2.06k|  __m256i sum = dc_sum_64(above);
  551|  2.06k|  (void)left;
  552|       |
  553|  2.06k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  554|  2.06k|  sum = _mm256_add_epi16(sum, thirtytwo);
  555|  2.06k|  sum = _mm256_srai_epi16(sum, 6);
  556|  2.06k|  const __m256i zero = _mm256_setzero_si256();
  557|  2.06k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  558|  2.06k|  row_store_64xh(&row, 16, dst, stride);
  559|  2.06k|}
aom_dc_left_predictor_32x16_avx2:
  564|  3.91k|                                      const uint8_t *left) {
  565|  3.91k|  __m128i sum = dc_sum_16_sse2(left);
  566|  3.91k|  (void)above;
  567|       |
  568|  3.91k|  const __m128i eight = _mm_set1_epi16(8);
  569|  3.91k|  sum = _mm_add_epi16(sum, eight);
  570|  3.91k|  sum = _mm_srai_epi16(sum, 4);
  571|  3.91k|  const __m128i zero = _mm_setzero_si128();
  572|  3.91k|  const __m128i r = _mm_shuffle_epi8(sum, zero);
  573|  3.91k|  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  574|  3.91k|  row_store_32xh(&row, 16, dst, stride);
  575|  3.91k|}
aom_dc_left_predictor_32x64_avx2:
  579|  1.40k|                                      const uint8_t *left) {
  580|  1.40k|  __m256i sum = dc_sum_64(left);
  581|  1.40k|  (void)above;
  582|       |
  583|  1.40k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  584|  1.40k|  sum = _mm256_add_epi16(sum, thirtytwo);
  585|  1.40k|  sum = _mm256_srai_epi16(sum, 6);
  586|  1.40k|  const __m256i zero = _mm256_setzero_si256();
  587|  1.40k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  588|  1.40k|  row_store_32xh(&row, 64, dst, stride);
  589|  1.40k|}
aom_dc_left_predictor_64x64_avx2:
  593|  19.2k|                                      const uint8_t *left) {
  594|  19.2k|  __m256i sum = dc_sum_64(left);
  595|  19.2k|  (void)above;
  596|       |
  597|  19.2k|  const __m256i thirtytwo = _mm256_set1_epi16(32);
  598|  19.2k|  sum = _mm256_add_epi16(sum, thirtytwo);
  599|  19.2k|  sum = _mm256_srai_epi16(sum, 6);
  600|  19.2k|  const __m256i zero = _mm256_setzero_si256();
  601|  19.2k|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  602|  19.2k|  row_store_64xh(&row, 64, dst, stride);
  603|  19.2k|}
aom_dc_left_predictor_64x32_avx2:
  607|    838|                                      const uint8_t *left) {
  608|    838|  __m256i sum = dc_sum_32(left);
  609|    838|  (void)above;
  610|       |
  611|    838|  const __m256i sixteen = _mm256_set1_epi16(16);
  612|    838|  sum = _mm256_add_epi16(sum, sixteen);
  613|    838|  sum = _mm256_srai_epi16(sum, 5);
  614|    838|  const __m256i zero = _mm256_setzero_si256();
  615|    838|  __m256i row = _mm256_shuffle_epi8(sum, zero);
  616|    838|  row_store_64xh(&row, 32, dst, stride);
  617|    838|}
aom_dc_left_predictor_64x16_avx2:
  622|    271|                                      const uint8_t *left) {
  623|    271|  __m128i sum = dc_sum_16_sse2(left);
  624|    271|  (void)above;
  625|       |
  626|    271|  const __m128i eight = _mm_set1_epi16(8);
  627|    271|  sum = _mm_add_epi16(sum, eight);
  628|    271|  sum = _mm_srai_epi16(sum, 4);
  629|    271|  const __m128i zero = _mm_setzero_si128();
  630|    271|  const __m128i r = _mm_shuffle_epi8(sum, zero);
  631|    271|  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  632|    271|  row_store_64xh(&row, 16, dst, stride);
  633|    271|}
aom_dc_128_predictor_32x16_avx2:
  638|  4.55k|                                     const uint8_t *left) {
  639|  4.55k|  (void)above;
  640|  4.55k|  (void)left;
  641|  4.55k|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  642|  4.55k|  row_store_32xh(&row, 16, dst, stride);
  643|  4.55k|}
aom_dc_128_predictor_32x64_avx2:
  647|    917|                                     const uint8_t *left) {
  648|    917|  (void)above;
  649|    917|  (void)left;
  650|    917|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  651|    917|  row_store_32xh(&row, 64, dst, stride);
  652|    917|}
aom_dc_128_predictor_64x64_avx2:
  656|  7.84k|                                     const uint8_t *left) {
  657|  7.84k|  (void)above;
  658|  7.84k|  (void)left;
  659|  7.84k|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  660|  7.84k|  row_store_64xh(&row, 64, dst, stride);
  661|  7.84k|}
aom_dc_128_predictor_64x32_avx2:
  665|  2.18k|                                     const uint8_t *left) {
  666|  2.18k|  (void)above;
  667|  2.18k|  (void)left;
  668|  2.18k|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  669|  2.18k|  row_store_64xh(&row, 32, dst, stride);
  670|  2.18k|}
aom_dc_128_predictor_64x16_avx2:
  675|    451|                                     const uint8_t *left) {
  676|    451|  (void)above;
  677|    451|  (void)left;
  678|    451|  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  679|    451|  row_store_64xh(&row, 16, dst, stride);
  680|    451|}
aom_v_predictor_32x16_avx2:
  684|  9.83k|                                const uint8_t *above, const uint8_t *left) {
  685|  9.83k|  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  686|  9.83k|  (void)left;
  687|  9.83k|  row_store_32xh(&row, 16, dst, stride);
  688|  9.83k|}
aom_v_predictor_32x64_avx2:
  691|    576|                                const uint8_t *above, const uint8_t *left) {
  692|    576|  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  693|    576|  (void)left;
  694|    576|  row_store_32xh(&row, 64, dst, stride);
  695|    576|}
aom_v_predictor_64x64_avx2:
  698|  2.53k|                                const uint8_t *above, const uint8_t *left) {
  699|  2.53k|  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  700|  2.53k|  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  701|  2.53k|  (void)left;
  702|  2.53k|  row_store_32x2xh(&row0, &row1, 64, dst, stride);
  703|  2.53k|}
aom_v_predictor_64x32_avx2:
  706|    636|                                const uint8_t *above, const uint8_t *left) {
  707|    636|  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  708|    636|  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  709|    636|  (void)left;
  710|    636|  row_store_32x2xh(&row0, &row1, 32, dst, stride);
  711|    636|}
aom_v_predictor_64x16_avx2:
  715|  1.28k|                                const uint8_t *above, const uint8_t *left) {
  716|  1.28k|  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  717|  1.28k|  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  718|  1.28k|  (void)left;
  719|  1.28k|  row_store_32x2xh(&row0, &row1, 16, dst, stride);
  720|  1.28k|}
aom_paeth_predictor_16x8_avx2:
  768|  76.4k|                                   const uint8_t *above, const uint8_t *left) {
  769|  76.4k|  __m128i x = _mm_loadl_epi64((const __m128i *)left);
  770|  76.4k|  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
  771|  76.4k|  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  772|  76.4k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  773|  76.4k|  const __m256i one = _mm256_set1_epi16(1);
  774|  76.4k|  const __m256i top = get_top_vector(above);
  775|       |
  776|  76.4k|  int i;
  777|   687k|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (777:15): [True: 611k, False: 76.4k]
  ------------------
  778|   611k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  779|   611k|    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  780|       |
  781|   611k|    _mm_store_si128((__m128i *)dst, row);
  782|   611k|    dst += stride;
  783|   611k|    rep = _mm256_add_epi16(rep, one);
  784|   611k|  }
  785|  76.4k|}
aom_paeth_predictor_16x16_avx2:
  793|  96.3k|                                    const uint8_t *above, const uint8_t *left) {
  794|  96.3k|  const __m256i l = get_left_vector(left);
  795|  96.3k|  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  796|  96.3k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  797|  96.3k|  const __m256i one = _mm256_set1_epi16(1);
  798|  96.3k|  const __m256i top = get_top_vector(above);
  799|       |
  800|  96.3k|  int i;
  801|  1.63M|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (801:15): [True: 1.54M, False: 96.3k]
  ------------------
  802|  1.54M|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  803|  1.54M|    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  804|       |
  805|  1.54M|    _mm_store_si128((__m128i *)dst, row);
  806|  1.54M|    dst += stride;
  807|  1.54M|    rep = _mm256_add_epi16(rep, one);
  808|  1.54M|  }
  809|  96.3k|}
aom_paeth_predictor_16x32_avx2:
  812|   990k|                                    const uint8_t *above, const uint8_t *left) {
  813|   990k|  __m256i l = get_left_vector(left);
  814|   990k|  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  815|   990k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  816|   990k|  const __m256i one = _mm256_set1_epi16(1);
  817|   990k|  const __m256i top = get_top_vector(above);
  818|       |
  819|   990k|  int i;
  820|  16.8M|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (820:15): [True: 15.8M, False: 990k]
  ------------------
  821|  15.8M|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  822|  15.8M|    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  823|       |
  824|  15.8M|    _mm_store_si128((__m128i *)dst, row);
  825|  15.8M|    dst += stride;
  826|  15.8M|    rep = _mm256_add_epi16(rep, one);
  827|  15.8M|  }
  828|       |
  829|   990k|  l = get_left_vector(left + 16);
  830|   990k|  rep = _mm256_set1_epi16((short)0x8000);
  831|  16.8M|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (831:15): [True: 15.8M, False: 990k]
  ------------------
  832|  15.8M|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  833|  15.8M|    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  834|       |
  835|  15.8M|    _mm_store_si128((__m128i *)dst, row);
  836|  15.8M|    dst += stride;
  837|  15.8M|    rep = _mm256_add_epi16(rep, one);
  838|  15.8M|  }
  839|   990k|}
aom_paeth_predictor_16x64_avx2:
  843|   246k|                                    const uint8_t *above, const uint8_t *left) {
  844|   246k|  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  845|   246k|  const __m256i one = _mm256_set1_epi16(1);
  846|   246k|  const __m256i top = get_top_vector(above);
  847|       |
  848|  1.23M|  for (int j = 0; j < 4; ++j) {
  ------------------
  |  Branch (848:19): [True: 984k, False: 246k]
  ------------------
  849|   984k|    const __m256i l = get_left_vector(left + j * 16);
  850|   984k|    __m256i rep = _mm256_set1_epi16((short)0x8000);
  851|  16.7M|    for (int i = 0; i < 16; ++i) {
  ------------------
  |  Branch (851:21): [True: 15.7M, False: 984k]
  ------------------
  852|  15.7M|      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  853|  15.7M|      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
  854|       |
  855|  15.7M|      _mm_store_si128((__m128i *)dst, row);
  856|  15.7M|      dst += stride;
  857|  15.7M|      rep = _mm256_add_epi16(rep, one);
  858|  15.7M|    }
  859|   984k|  }
  860|   246k|}
aom_paeth_predictor_32x16_avx2:
  879|  29.0k|                                    const uint8_t *above, const uint8_t *left) {
  880|  29.0k|  const __m256i l = get_left_vector(left);
  881|  29.0k|  const __m256i t0 = get_top_vector(above);
  882|  29.0k|  const __m256i t1 = get_top_vector(above + 16);
  883|  29.0k|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  884|  29.0k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  885|  29.0k|  const __m256i one = _mm256_set1_epi16(1);
  886|       |
  887|  29.0k|  int i;
  888|   493k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (888:15): [True: 464k, False: 29.0k]
  ------------------
  889|   464k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  890|       |
  891|   464k|    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
  892|       |
  893|   464k|    _mm256_storeu_si256((__m256i *)dst, r);
  894|       |
  895|   464k|    dst += stride;
  896|   464k|    rep = _mm256_add_epi16(rep, one);
  897|   464k|  }
  898|  29.0k|}
aom_paeth_predictor_32x32_avx2:
  901|   268k|                                    const uint8_t *above, const uint8_t *left) {
  902|   268k|  __m256i l = get_left_vector(left);
  903|   268k|  const __m256i t0 = get_top_vector(above);
  904|   268k|  const __m256i t1 = get_top_vector(above + 16);
  905|   268k|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  906|   268k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
  907|   268k|  const __m256i one = _mm256_set1_epi16(1);
  908|       |
  909|   268k|  int i;
  910|  4.56M|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (910:15): [True: 4.30M, False: 268k]
  ------------------
  911|  4.30M|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  912|       |
  913|  4.30M|    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
  914|  4.30M|    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
  915|       |
  916|  4.30M|    _mm_store_si128((__m128i *)dst, r0);
  917|  4.30M|    _mm_store_si128((__m128i *)(dst + 16), r1);
  918|       |
  919|  4.30M|    dst += stride;
  920|  4.30M|    rep = _mm256_add_epi16(rep, one);
  921|  4.30M|  }
  922|       |
  923|   268k|  l = get_left_vector(left + 16);
  924|   268k|  rep = _mm256_set1_epi16((short)0x8000);
  925|  4.56M|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (925:15): [True: 4.30M, False: 268k]
  ------------------
  926|  4.30M|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  927|       |
  928|  4.30M|    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
  929|  4.30M|    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
  930|       |
  931|  4.30M|    _mm_store_si128((__m128i *)dst, r0);
  932|  4.30M|    _mm_store_si128((__m128i *)(dst + 16), r1);
  933|       |
  934|  4.30M|    dst += stride;
  935|  4.30M|    rep = _mm256_add_epi16(rep, one);
  936|  4.30M|  }
  937|   268k|}
aom_paeth_predictor_32x64_avx2:
  940|  5.11k|                                    const uint8_t *above, const uint8_t *left) {
  941|  5.11k|  const __m256i t0 = get_top_vector(above);
  942|  5.11k|  const __m256i t1 = get_top_vector(above + 16);
  943|  5.11k|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  944|  5.11k|  const __m256i one = _mm256_set1_epi16(1);
  945|       |
  946|  5.11k|  int i, j;
  947|  25.5k|  for (j = 0; j < 4; ++j) {
  ------------------
  |  Branch (947:15): [True: 20.4k, False: 5.11k]
  ------------------
  948|  20.4k|    const __m256i l = get_left_vector(left + j * 16);
  949|  20.4k|    __m256i rep = _mm256_set1_epi16((short)0x8000);
  950|   347k|    for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (950:17): [True: 327k, False: 20.4k]
  ------------------
  951|   327k|      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  952|       |
  953|   327k|      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
  954|   327k|      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
  955|       |
  956|   327k|      _mm_store_si128((__m128i *)dst, r0);
  957|   327k|      _mm_store_si128((__m128i *)(dst + 16), r1);
  958|       |
  959|   327k|      dst += stride;
  960|   327k|      rep = _mm256_add_epi16(rep, one);
  961|   327k|    }
  962|  20.4k|  }
  963|  5.11k|}
aom_paeth_predictor_64x32_avx2:
  966|  5.00k|                                    const uint8_t *above, const uint8_t *left) {
  967|  5.00k|  const __m256i t0 = get_top_vector(above);
  968|  5.00k|  const __m256i t1 = get_top_vector(above + 16);
  969|  5.00k|  const __m256i t2 = get_top_vector(above + 32);
  970|  5.00k|  const __m256i t3 = get_top_vector(above + 48);
  971|  5.00k|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  972|  5.00k|  const __m256i one = _mm256_set1_epi16(1);
  973|       |
  974|  5.00k|  int i, j;
  975|  15.0k|  for (j = 0; j < 2; ++j) {
  ------------------
  |  Branch (975:15): [True: 10.0k, False: 5.00k]
  ------------------
  976|  10.0k|    const __m256i l = get_left_vector(left + j * 16);
  977|  10.0k|    __m256i rep = _mm256_set1_epi16((short)0x8000);
  978|   170k|    for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (978:17): [True: 160k, False: 10.0k]
  ------------------
  979|   160k|      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
  980|       |
  981|   160k|      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
  982|   160k|      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
  983|   160k|      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
  984|   160k|      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
  985|       |
  986|   160k|      _mm_store_si128((__m128i *)dst, r0);
  987|   160k|      _mm_store_si128((__m128i *)(dst + 16), r1);
  988|   160k|      _mm_store_si128((__m128i *)(dst + 32), r2);
  989|   160k|      _mm_store_si128((__m128i *)(dst + 48), r3);
  990|       |
  991|   160k|      dst += stride;
  992|   160k|      rep = _mm256_add_epi16(rep, one);
  993|   160k|    }
  994|  10.0k|  }
  995|  5.00k|}
aom_paeth_predictor_64x64_avx2:
  998|  39.5k|                                    const uint8_t *above, const uint8_t *left) {
  999|  39.5k|  const __m256i t0 = get_top_vector(above);
 1000|  39.5k|  const __m256i t1 = get_top_vector(above + 16);
 1001|  39.5k|  const __m256i t2 = get_top_vector(above + 32);
 1002|  39.5k|  const __m256i t3 = get_top_vector(above + 48);
 1003|  39.5k|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
 1004|  39.5k|  const __m256i one = _mm256_set1_epi16(1);
 1005|       |
 1006|  39.5k|  int i, j;
 1007|   197k|  for (j = 0; j < 4; ++j) {
  ------------------
  |  Branch (1007:15): [True: 158k, False: 39.5k]
  ------------------
 1008|   158k|    const __m256i l = get_left_vector(left + j * 16);
 1009|   158k|    __m256i rep = _mm256_set1_epi16((short)0x8000);
 1010|  2.69M|    for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (1010:17): [True: 2.53M, False: 158k]
  ------------------
 1011|  2.53M|      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
 1012|       |
 1013|  2.53M|      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
 1014|  2.53M|      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
 1015|  2.53M|      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
 1016|  2.53M|      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
 1017|       |
 1018|  2.53M|      _mm_store_si128((__m128i *)dst, r0);
 1019|  2.53M|      _mm_store_si128((__m128i *)(dst + 16), r1);
 1020|  2.53M|      _mm_store_si128((__m128i *)(dst + 32), r2);
 1021|  2.53M|      _mm_store_si128((__m128i *)(dst + 48), r3);
 1022|       |
 1023|  2.53M|      dst += stride;
 1024|  2.53M|      rep = _mm256_add_epi16(rep, one);
 1025|  2.53M|    }
 1026|   158k|  }
 1027|  39.5k|}
aom_paeth_predictor_64x16_avx2:
 1031|  9.66k|                                    const uint8_t *above, const uint8_t *left) {
 1032|  9.66k|  const __m256i t0 = get_top_vector(above);
 1033|  9.66k|  const __m256i t1 = get_top_vector(above + 16);
 1034|  9.66k|  const __m256i t2 = get_top_vector(above + 32);
 1035|  9.66k|  const __m256i t3 = get_top_vector(above + 48);
 1036|  9.66k|  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
 1037|  9.66k|  const __m256i one = _mm256_set1_epi16(1);
 1038|       |
 1039|  9.66k|  int i;
 1040|  9.66k|  const __m256i l = get_left_vector(left);
 1041|  9.66k|  __m256i rep = _mm256_set1_epi16((short)0x8000);
 1042|   164k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (1042:15): [True: 154k, False: 9.66k]
  ------------------
 1043|   154k|    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
 1044|       |
 1045|   154k|    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
 1046|   154k|    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
 1047|   154k|    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
 1048|   154k|    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
 1049|       |
 1050|   154k|    _mm_store_si128((__m128i *)dst, r0);
 1051|   154k|    _mm_store_si128((__m128i *)(dst + 16), r1);
 1052|   154k|    _mm_store_si128((__m128i *)(dst + 32), r2);
 1053|   154k|    _mm_store_si128((__m128i *)(dst + 48), r3);
 1054|       |
 1055|   154k|    dst += stride;
 1056|   154k|    rep = _mm256_add_epi16(rep, one);
 1057|   154k|  }
 1058|  9.66k|}
av1_highbd_dr_prediction_z1_avx2:
 1921|   765k|                                      int dx, int dy, int bd) {
 1922|   765k|  (void)left;
 1923|   765k|  (void)dy;
 1924|       |
 1925|   765k|  switch (bw) {
 1926|   183k|    case 4:
  ------------------
  |  Branch (1926:5): [True: 183k, False: 582k]
  ------------------
 1927|   183k|      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
 1928|   183k|                                       dx, bd);
 1929|   183k|      break;
 1930|   269k|    case 8:
  ------------------
  |  Branch (1930:5): [True: 269k, False: 496k]
  ------------------
 1931|   269k|      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
 1932|   269k|                                       dx, bd);
 1933|   269k|      break;
 1934|   209k|    case 16:
  ------------------
  |  Branch (1934:5): [True: 209k, False: 555k]
  ------------------
 1935|   209k|      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
 1936|   209k|                                        dx, bd);
 1937|   209k|      break;
 1938|  78.0k|    case 32:
  ------------------
  |  Branch (1938:5): [True: 78.0k, False: 687k]
  ------------------
 1939|  78.0k|      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
 1940|  78.0k|                                        dx, bd);
 1941|  78.0k|      break;
 1942|  25.2k|    case 64:
  ------------------
  |  Branch (1942:5): [True: 25.2k, False: 740k]
  ------------------
 1943|  25.2k|      if (bd < 12) {
  ------------------
  |  Branch (1943:11): [True: 11.9k, False: 13.3k]
  ------------------
 1944|  11.9k|        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
 1945|  11.9k|                                          upsample_above, dx);
 1946|  13.3k|      } else {
 1947|  13.3k|        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
 1948|  13.3k|                                                upsample_above, dx);
 1949|  13.3k|      }
 1950|  25.2k|      break;
 1951|      0|    default: break;
  ------------------
  |  Branch (1951:5): [True: 0, False: 765k]
  ------------------
 1952|   765k|  }
 1953|   765k|  return;
 1954|   765k|}
av1_highbd_dr_prediction_z2_avx2:
 2874|  1.63M|                                      int bd) {
 2875|  1.63M|  (void)bd;
 2876|  1.63M|  assert(dx > 0);
 2877|  1.63M|  assert(dy > 0);
 2878|  1.63M|  switch (bw) {
 2879|   504k|    case 4:
  ------------------
  |  Branch (2879:5): [True: 504k, False: 1.13M]
  ------------------
 2880|   504k|      if (bd < 12) {
  ------------------
  |  Branch (2880:11): [True: 213k, False: 290k]
  ------------------
 2881|   213k|        highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
 2882|   213k|                                         upsample_above, upsample_left, dx, dy);
 2883|   290k|      } else {
 2884|   290k|        highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
 2885|   290k|                                               upsample_above, upsample_left,
 2886|   290k|                                               dx, dy);
 2887|   290k|      }
 2888|   504k|      break;
 2889|   534k|    case 8:
  ------------------
  |  Branch (2889:5): [True: 534k, False: 1.10M]
  ------------------
 2890|   534k|      if (bd < 12) {
  ------------------
  |  Branch (2890:11): [True: 252k, False: 282k]
  ------------------
 2891|   252k|        highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
 2892|   252k|                                         upsample_above, upsample_left, dx, dy);
 2893|   282k|      } else {
 2894|   282k|        highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
 2895|   282k|                                               upsample_above, upsample_left,
 2896|   282k|                                               dx, dy);
 2897|   282k|      }
 2898|   534k|      break;
 2899|   598k|    default:
  ------------------
  |  Branch (2899:5): [True: 598k, False: 1.03M]
  ------------------
 2900|   598k|      if (bd < 12) {
  ------------------
  |  Branch (2900:11): [True: 420k, False: 177k]
  ------------------
 2901|   420k|        highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
 2902|   420k|                                         upsample_above, upsample_left, dx, dy);
 2903|   420k|      } else {
 2904|   177k|        highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
 2905|   177k|                                               upsample_above, upsample_left,
 2906|   177k|                                               dx, dy);
 2907|   177k|      }
 2908|   598k|      break;
 2909|  1.63M|  }
 2910|  1.63M|}
av1_highbd_dr_prediction_z3_avx2:
 3342|  1.14M|                                      int dx, int dy, int bd) {
 3343|  1.14M|  (void)above;
 3344|  1.14M|  (void)dx;
 3345|       |
 3346|  1.14M|  assert(dx == 1);
 3347|  1.14M|  assert(dy > 0);
 3348|  1.14M|  if (bw == bh) {
  ------------------
  |  Branch (3348:7): [True: 674k, False: 471k]
  ------------------
 3349|   674k|    switch (bw) {
  ------------------
  |  Branch (3349:13): [True: 18.4E, False: 674k]
  ------------------
 3350|   225k|      case 4:
  ------------------
  |  Branch (3350:7): [True: 225k, False: 448k]
  ------------------
 3351|   225k|        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
 3352|   225k|                                         bd);
 3353|   225k|        break;
 3354|   210k|      case 8:
  ------------------
  |  Branch (3354:7): [True: 210k, False: 463k]
  ------------------
 3355|   210k|        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
 3356|   210k|                                         bd);
 3357|   210k|        break;
 3358|   112k|      case 16:
  ------------------
  |  Branch (3358:7): [True: 112k, False: 562k]
  ------------------
 3359|   112k|        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
 3360|   112k|                                           bd);
 3361|   112k|        break;
 3362|  94.8k|      case 32:
  ------------------
  |  Branch (3362:7): [True: 94.8k, False: 579k]
  ------------------
 3363|  94.8k|        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
 3364|  94.8k|                                           bd);
 3365|  94.8k|        break;
 3366|  30.9k|      case 64:
  ------------------
  |  Branch (3366:7): [True: 30.9k, False: 643k]
  ------------------
 3367|  30.9k|        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
 3368|  30.9k|                                           bd);
 3369|  30.9k|        break;
 3370|   674k|    }
 3371|   674k|  } else {
 3372|   471k|    if (bw < bh) {
  ------------------
  |  Branch (3372:9): [True: 148k, False: 323k]
  ------------------
 3373|   148k|      if (bw + bw == bh) {
  ------------------
  |  Branch (3373:11): [True: 106k, False: 42.1k]
  ------------------
 3374|   106k|        switch (bw) {
  ------------------
  |  Branch (3374:17): [True: 0, False: 106k]
  ------------------
 3375|  35.7k|          case 4:
  ------------------
  |  Branch (3375:11): [True: 35.7k, False: 70.6k]
  ------------------
 3376|  35.7k|            highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
 3377|  35.7k|                                             dy, bd);
 3378|  35.7k|            break;
 3379|  42.5k|          case 8:
  ------------------
  |  Branch (3379:11): [True: 42.5k, False: 63.8k]
  ------------------
 3380|  42.5k|            highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
 3381|  42.5k|                                              dy, bd);
 3382|  42.5k|            break;
 3383|  25.9k|          case 16:
  ------------------
  |  Branch (3383:11): [True: 25.9k, False: 80.4k]
  ------------------
 3384|  25.9k|            highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
 3385|  25.9k|                                               dy, bd);
 3386|  25.9k|            break;
 3387|  2.18k|          case 32:
  ------------------
  |  Branch (3387:11): [True: 2.18k, False: 104k]
  ------------------
 3388|  2.18k|            highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
 3389|  2.18k|                                               dy, bd);
 3390|  2.18k|            break;
 3391|   106k|        }
 3392|   106k|      } else {
 3393|  42.1k|        switch (bw) {
  ------------------
  |  Branch (3393:17): [True: 0, False: 42.1k]
  ------------------
 3394|      0|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 3395|  24.1k|          case 4:
  ------------------
  |  Branch (3395:11): [True: 24.1k, False: 18.0k]
  ------------------
 3396|  24.1k|            highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
 3397|  24.1k|                                              dy, bd);
 3398|  24.1k|            break;
 3399|  13.1k|          case 8:
  ------------------
  |  Branch (3399:11): [True: 13.1k, False: 29.0k]
  ------------------
 3400|  13.1k|            highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
 3401|  13.1k|                                              dy, bd);
 3402|  13.1k|            break;
 3403|  4.87k|          case 16:
  ------------------
  |  Branch (3403:11): [True: 4.87k, False: 37.2k]
  ------------------
 3404|  4.87k|            highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
 3405|  4.87k|                                               dy, bd);
 3406|  4.87k|            break;
 3407|  42.1k|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 3408|  42.1k|        }
 3409|  42.1k|      }
 3410|   323k|    } else {
 3411|   323k|      if (bh + bh == bw) {
  ------------------
  |  Branch (3411:11): [True: 185k, False: 138k]
  ------------------
 3412|   185k|        switch (bh) {
  ------------------
  |  Branch (3412:17): [True: 18.4E, False: 185k]
  ------------------
 3413|  66.0k|          case 4:
  ------------------
  |  Branch (3413:11): [True: 66.0k, False: 119k]
  ------------------
 3414|  66.0k|            highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
 3415|  66.0k|                                             dy, bd);
 3416|  66.0k|            break;
 3417|  85.7k|          case 8:
  ------------------
  |  Branch (3417:11): [True: 85.7k, False: 99.3k]
  ------------------
 3418|  85.7k|            highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
 3419|  85.7k|                                              dy, bd);
 3420|  85.7k|            break;
 3421|  30.4k|          case 16:
  ------------------
  |  Branch (3421:11): [True: 30.4k, False: 154k]
  ------------------
 3422|  30.4k|            highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
 3423|  30.4k|                                               dy, bd);
 3424|  30.4k|            break;
 3425|  2.94k|          case 32:
  ------------------
  |  Branch (3425:11): [True: 2.94k, False: 182k]
  ------------------
 3426|  2.94k|            highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
 3427|  2.94k|                                               dy, bd);
 3428|  2.94k|            break;
 3429|   185k|        }
 3430|   185k|      } else {
 3431|   138k|        switch (bh) {
  ------------------
  |  Branch (3431:17): [True: 18.4E, False: 138k]
  ------------------
 3432|      0|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 3433|  63.8k|          case 4:
  ------------------
  |  Branch (3433:11): [True: 63.8k, False: 74.3k]
  ------------------
 3434|  63.8k|            highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
 3435|  63.8k|                                              dy, bd);
 3436|  63.8k|            break;
 3437|  57.0k|          case 8:
  ------------------
  |  Branch (3437:11): [True: 57.0k, False: 81.1k]
  ------------------
 3438|  57.0k|            highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
 3439|  57.0k|                                              dy, bd);
 3440|  57.0k|            break;
 3441|  17.3k|          case 16:
  ------------------
  |  Branch (3441:11): [True: 17.3k, False: 120k]
  ------------------
 3442|  17.3k|            highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
 3443|  17.3k|                                               dy, bd);
 3444|  17.3k|            break;
 3445|   138k|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 3446|   138k|        }
 3447|   138k|      }
 3448|   323k|    }
 3449|   471k|  }
 3450|  1.14M|  return;
 3451|  1.14M|}
av1_dr_prediction_z1_avx2:
 3818|   469k|                               int upsample_above, int dx, int dy) {
 3819|   469k|  (void)left;
 3820|   469k|  (void)dy;
 3821|   469k|  switch (bw) {
 3822|   138k|    case 4:
  ------------------
  |  Branch (3822:5): [True: 138k, False: 330k]
  ------------------
 3823|   138k|      dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3824|   138k|      break;
 3825|   126k|    case 8:
  ------------------
  |  Branch (3825:5): [True: 126k, False: 342k]
  ------------------
 3826|   126k|      dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3827|   126k|      break;
 3828|   115k|    case 16:
  ------------------
  |  Branch (3828:5): [True: 115k, False: 353k]
  ------------------
 3829|   115k|      dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3830|   115k|      break;
 3831|  73.1k|    case 32:
  ------------------
  |  Branch (3831:5): [True: 73.1k, False: 395k]
  ------------------
 3832|  73.1k|      dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3833|  73.1k|      break;
 3834|  14.5k|    case 64:
  ------------------
  |  Branch (3834:5): [True: 14.5k, False: 454k]
  ------------------
 3835|  14.5k|      dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
 3836|  14.5k|      break;
 3837|      0|    default: break;
  ------------------
  |  Branch (3837:5): [True: 0, False: 469k]
  ------------------
 3838|   469k|  }
 3839|   469k|  return;
 3840|   469k|}
av1_dr_prediction_z2_avx2:
 4247|  1.28M|                               int dy) {
 4248|  1.28M|  assert(dx > 0);
 4249|  1.28M|  assert(dy > 0);
 4250|  1.28M|  switch (bw) {
 4251|   576k|    case 4:
  ------------------
  |  Branch (4251:5): [True: 576k, False: 711k]
  ------------------
 4252|   576k|      dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
 4253|   576k|                                upsample_left, dx, dy);
 4254|   576k|      break;
 4255|   264k|    case 8:
  ------------------
  |  Branch (4255:5): [True: 264k, False: 1.02M]
  ------------------
 4256|   264k|      dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
 4257|   264k|                                upsample_left, dx, dy);
 4258|   264k|      break;
 4259|   446k|    default:
  ------------------
  |  Branch (4259:5): [True: 446k, False: 841k]
  ------------------
 4260|   446k|      dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
 4261|   446k|                                upsample_above, upsample_left, dx, dy);
 4262|   446k|      break;
 4263|  1.28M|  }
 4264|  1.28M|  return;
 4265|  1.28M|}
av1_dr_prediction_z3_avx2:
 4652|   741k|                               int upsample_left, int dx, int dy) {
 4653|   741k|  (void)above;
 4654|   741k|  (void)dx;
 4655|   741k|  assert(dx == 1);
 4656|   741k|  assert(dy > 0);
 4657|       |
 4658|   741k|  if (bw == bh) {
  ------------------
  |  Branch (4658:7): [True: 404k, False: 336k]
  ------------------
 4659|   404k|    switch (bw) {
  ------------------
  |  Branch (4659:13): [True: 18.4E, False: 404k]
  ------------------
 4660|   115k|      case 4:
  ------------------
  |  Branch (4660:7): [True: 115k, False: 289k]
  ------------------
 4661|   115k|        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
 4662|   115k|        break;
 4663|  98.0k|      case 8:
  ------------------
  |  Branch (4663:7): [True: 98.0k, False: 306k]
  ------------------
 4664|  98.0k|        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
 4665|  98.0k|        break;
 4666|  87.1k|      case 16:
  ------------------
  |  Branch (4666:7): [True: 87.1k, False: 317k]
  ------------------
 4667|  87.1k|        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
 4668|  87.1k|        break;
 4669|  81.0k|      case 32:
  ------------------
  |  Branch (4669:7): [True: 81.0k, False: 323k]
  ------------------
 4670|  81.0k|        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
 4671|  81.0k|        break;
 4672|  23.0k|      case 64:
  ------------------
  |  Branch (4672:7): [True: 23.0k, False: 381k]
  ------------------
 4673|  23.0k|        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
 4674|  23.0k|        break;
 4675|   404k|    }
 4676|   404k|  } else {
 4677|   336k|    if (bw < bh) {
  ------------------
  |  Branch (4677:9): [True: 105k, False: 231k]
  ------------------
 4678|   105k|      if (bw + bw == bh) {
  ------------------
  |  Branch (4678:11): [True: 73.0k, False: 32.0k]
  ------------------
 4679|  73.0k|        switch (bw) {
  ------------------
  |  Branch (4679:17): [True: 0, False: 73.0k]
  ------------------
 4680|  23.9k|          case 4:
  ------------------
  |  Branch (4680:11): [True: 23.9k, False: 49.0k]
  ------------------
 4681|  23.9k|            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
 4682|  23.9k|            break;
 4683|  26.2k|          case 8:
  ------------------
  |  Branch (4683:11): [True: 26.2k, False: 46.8k]
  ------------------
 4684|  26.2k|            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
 4685|  26.2k|            break;
 4686|  21.2k|          case 16:
  ------------------
  |  Branch (4686:11): [True: 21.2k, False: 51.8k]
  ------------------
 4687|  21.2k|            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
 4688|  21.2k|            break;
 4689|  1.55k|          case 32:
  ------------------
  |  Branch (4689:11): [True: 1.55k, False: 71.5k]
  ------------------
 4690|  1.55k|            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
 4691|  1.55k|            break;
 4692|  73.0k|        }
 4693|  73.0k|      } else {
 4694|  32.0k|        switch (bw) {
  ------------------
  |  Branch (4694:17): [True: 0, False: 32.0k]
  ------------------
 4695|      0|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 4696|  17.3k|          case 4:
  ------------------
  |  Branch (4696:11): [True: 17.3k, False: 14.6k]
  ------------------
 4697|  17.3k|            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
 4698|  17.3k|            break;
 4699|  10.5k|          case 8:
  ------------------
  |  Branch (4699:11): [True: 10.5k, False: 21.5k]
  ------------------
 4700|  10.5k|            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
 4701|  10.5k|            break;
 4702|  4.15k|          case 16:
  ------------------
  |  Branch (4702:11): [True: 4.15k, False: 27.9k]
  ------------------
 4703|  4.15k|            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
 4704|  4.15k|            break;
 4705|  32.0k|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 4706|  32.0k|        }
 4707|  32.0k|      }
 4708|   231k|    } else {
 4709|   231k|      if (bh + bh == bw) {
  ------------------
  |  Branch (4709:11): [True: 114k, False: 116k]
  ------------------
 4710|   114k|        switch (bh) {
  ------------------
  |  Branch (4710:17): [True: 18.4E, False: 114k]
  ------------------
 4711|  40.7k|          case 4:
  ------------------
  |  Branch (4711:11): [True: 40.7k, False: 74.2k]
  ------------------
 4712|  40.7k|            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
 4713|  40.7k|            break;
 4714|  50.2k|          case 8:
  ------------------
  |  Branch (4714:11): [True: 50.2k, False: 64.6k]
  ------------------
 4715|  50.2k|            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
 4716|  50.2k|            break;
 4717|  21.0k|          case 16:
  ------------------
  |  Branch (4717:11): [True: 21.0k, False: 93.8k]
  ------------------
 4718|  21.0k|            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
 4719|  21.0k|            break;
 4720|  2.90k|          case 32:
  ------------------
  |  Branch (4720:11): [True: 2.90k, False: 112k]
  ------------------
 4721|  2.90k|            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
 4722|  2.90k|            break;
 4723|   114k|        }
 4724|   116k|      } else {
 4725|   116k|        switch (bh) {
  ------------------
  |  Branch (4725:17): [True: 18.4E, False: 116k]
  ------------------
 4726|      0|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 4727|  55.1k|          case 4:
  ------------------
  |  Branch (4727:11): [True: 55.1k, False: 61.4k]
  ------------------
 4728|  55.1k|            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
 4729|  55.1k|            break;
 4730|  45.2k|          case 8:
  ------------------
  |  Branch (4730:11): [True: 45.2k, False: 71.3k]
  ------------------
 4731|  45.2k|            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
 4732|  45.2k|            break;
 4733|  16.3k|          case 16:
  ------------------
  |  Branch (4733:11): [True: 16.3k, False: 100k]
  ------------------
 4734|  16.3k|            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
 4735|  16.3k|            break;
 4736|   116k|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 4737|   116k|        }
 4738|   116k|      }
 4739|   231k|    }
 4740|   336k|  }
 4741|   741k|}
intrapred_avx2.c:dc_sum_32:
   32|  2.02M|static inline __m256i dc_sum_32(const uint8_t *ref) {
   33|  2.02M|  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
   34|  2.02M|  const __m256i zero = _mm256_setzero_si256();
   35|  2.02M|  __m256i y = _mm256_sad_epu8(x, zero);
   36|  2.02M|  __m256i u = _mm256_permute2x128_si256(y, y, 1);
   37|  2.02M|  y = _mm256_add_epi64(u, y);
   38|  2.02M|  u = _mm256_unpackhi_epi64(y, y);
   39|  2.02M|  return _mm256_add_epi16(y, u);
   40|  2.02M|}
intrapred_avx2.c:row_store_32xh:
   43|  1.31M|                                  ptrdiff_t stride) {
   44|  41.0M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (44:19): [True: 39.7M, False: 1.31M]
  ------------------
   45|  39.7M|    _mm256_storeu_si256((__m256i *)dst, *r);
   46|  39.7M|    dst += stride;
   47|  39.7M|  }
   48|  1.31M|}
intrapred_avx2.c:h_predictor_32x8line:
  384|   613k|                                        ptrdiff_t stride) {
  385|   613k|  __m256i t[4];
  386|   613k|  __m256i m = _mm256_setzero_si256();
  387|   613k|  const __m256i inc = _mm256_set1_epi8(4);
  388|   613k|  int i;
  389|       |
  390|  3.06M|  for (i = 0; i < 4; i++) {
  ------------------
  |  Branch (390:15): [True: 2.45M, False: 613k]
  ------------------
  391|  2.45M|    t[i] = _mm256_shuffle_epi8(*row, m);
  392|  2.45M|    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
  393|  2.45M|    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
  394|  2.45M|    _mm256_storeu_si256((__m256i *)dst, r0);
  395|  2.45M|    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
  396|  2.45M|    dst += stride;
  397|  2.45M|    m = _mm256_add_epi8(m, inc);
  398|  2.45M|  }
  399|   613k|}
intrapred_avx2.c:dc_sum_64:
   19|   317k|static inline __m256i dc_sum_64(const uint8_t *ref) {
   20|   317k|  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
   21|   317k|  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
   22|   317k|  const __m256i zero = _mm256_setzero_si256();
   23|   317k|  __m256i y0 = _mm256_sad_epu8(x0, zero);
   24|   317k|  __m256i y1 = _mm256_sad_epu8(x1, zero);
   25|   317k|  y0 = _mm256_add_epi64(y0, y1);
   26|   317k|  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
   27|   317k|  y0 = _mm256_add_epi64(u0, y0);
   28|   317k|  u0 = _mm256_unpackhi_epi64(y0, y0);
   29|   317k|  return _mm256_add_epi16(y0, u0);
   30|   317k|}
intrapred_avx2.c:row_store_64xh:
   61|   219k|                                  ptrdiff_t stride) {
   62|  10.8M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (62:19): [True: 10.6M, False: 219k]
  ------------------
   63|  10.6M|    _mm256_storeu_si256((__m256i *)dst, *r);
   64|  10.6M|    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
   65|  10.6M|    dst += stride;
   66|  10.6M|  }
   67|   219k|}
intrapred_avx2.c:row_store_32x2xh:
   52|  4.45k|                                    ptrdiff_t stride) {
   53|   207k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (53:19): [True: 203k, False: 4.45k]
  ------------------
   54|   203k|    _mm256_storeu_si256((__m256i *)dst, *r0);
   55|   203k|    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
   56|   203k|    dst += stride;
   57|   203k|  }
   58|  4.45k|}
intrapred_avx2.c:get_top_vector:
  759|  2.23M|static inline __m256i get_top_vector(const uint8_t *above) {
  760|  2.23M|  const __m128i x = _mm_load_si128((const __m128i *)above);
  761|  2.23M|  const __m128i zero = _mm_setzero_si128();
  762|  2.23M|  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
  763|  2.23M|  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
  764|  2.23M|  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
  765|  2.23M|}
intrapred_avx2.c:paeth_16x1_pred:
  752|  78.8M|                                      const __m256i *topleft) {
  753|  78.8M|  const __m256i p0 = paeth_pred(left, top, topleft);
  754|  78.8M|  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  755|  78.8M|  const __m256i p = _mm256_packus_epi16(p0, p1);
  756|  78.8M|  return _mm256_castsi256_si128(p);
  757|  78.8M|}
intrapred_avx2.c:paeth_pred:
  728|  79.7M|                                 const __m256i *topleft) {
  729|  79.7M|  const __m256i base =
  730|  79.7M|      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
  731|       |
  732|  79.7M|  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
  733|  79.7M|  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
  734|  79.7M|  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
  735|       |
  736|  79.7M|  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
  737|  79.7M|  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
  738|  79.7M|  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
  739|       |
  740|  79.7M|  pl = _mm256_andnot_si256(mask1, *left);
  741|       |
  742|  79.7M|  ptl = _mm256_and_si256(mask2, *topleft);
  743|  79.7M|  pt = _mm256_andnot_si256(mask2, *top);
  744|  79.7M|  pt = _mm256_or_si256(pt, ptl);
  745|  79.7M|  pt = _mm256_and_si256(mask1, pt);
  746|       |
  747|  79.7M|  return _mm256_or_si256(pt, pl);
  748|  79.7M|}
intrapred_avx2.c:get_left_vector:
  787|  3.82M|static inline __m256i get_left_vector(const uint8_t *left) {
  788|  3.82M|  const __m128i x = _mm_load_si128((const __m128i *)left);
  789|  3.82M|  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
  790|  3.82M|}
intrapred_avx2.c:paeth_32x1_pred:
  866|   464k|                                      const __m256i *topleft) {
  867|   464k|  __m256i p0 = paeth_pred(left, top0, topleft);
  868|   464k|  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  869|   464k|  const __m256i x0 = _mm256_packus_epi16(p0, p1);
  870|       |
  871|   464k|  p0 = paeth_pred(left, top1, topleft);
  872|   464k|  p1 = _mm256_permute4x64_epi64(p0, 0xe);
  873|   464k|  const __m256i x1 = _mm256_packus_epi16(p0, p1);
  874|       |
  875|   464k|  return _mm256_permute2x128_si256(x0, x1, 0x20);
  876|   464k|}
intrapred_avx2.c:highbd_dr_prediction_z1_4xN_avx2:
 1207|   183k|                                             int bd) {
 1208|   183k|  __m128i dstvec[16];
 1209|   183k|  if (bd < 12) {
  ------------------
  |  Branch (1209:7): [True: 116k, False: 67.2k]
  ------------------
 1210|   116k|    highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
 1211|   116k|                                              dx);
 1212|   116k|  } else {
 1213|  67.2k|    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
 1214|  67.2k|                                                    upsample_above, dx);
 1215|  67.2k|  }
 1216|  1.40M|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (1216:19): [True: 1.22M, False: 183k]
  ------------------
 1217|  1.22M|    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
 1218|  1.22M|  }
 1219|   183k|}
intrapred_avx2.c:highbd_dr_prediction_z1_4xN_internal_avx2:
 1064|   393k|    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
 1065|   393k|  const int frac_bits = 6 - upsample_above;
 1066|   393k|  const int max_base_x = ((N + 4) - 1) << upsample_above;
 1067|       |
 1068|   393k|  assert(dx > 0);
 1069|       |  // pre-filter above pixels
 1070|       |  // store in temp buffers:
 1071|       |  //   above[x] * 32 + 16
 1072|       |  //   above[x+1] - above[x]
 1073|       |  // final pixels will be calculated as:
 1074|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1075|   393k|  __m256i a0, a1, a32, a16;
 1076|   393k|  __m256i diff, c3f;
 1077|   393k|  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
 1078|   393k|  __m128i a0_128, a1_128;
 1079|   393k|  a16 = _mm256_set1_epi16(16);
 1080|   393k|  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
 1081|   393k|  max_base_x128 = _mm_set1_epi16(max_base_x);
 1082|   393k|  c3f = _mm256_set1_epi16(0x3f);
 1083|       |
 1084|   393k|  int x = dx;
 1085|  3.00M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1085:19): [True: 2.61M, False: 389k]
  ------------------
 1086|  2.61M|    __m256i b, res, shift;
 1087|  2.61M|    __m128i res1;
 1088|       |
 1089|  2.61M|    int base = x >> frac_bits;
 1090|  2.61M|    if (base >= max_base_x) {
  ------------------
  |  Branch (1090:9): [True: 3.53k, False: 2.61M]
  ------------------
 1091|  8.51k|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1091:23): [True: 4.97k, False: 3.53k]
  ------------------
 1092|  4.97k|        dst[i] = a_mbase_x;  // save 4 values
 1093|  4.97k|      }
 1094|  3.53k|      return;
 1095|  3.53k|    }
 1096|       |
 1097|  2.61M|    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
 1098|  2.61M|    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
 1099|       |
 1100|  2.61M|    if (upsample_above) {
  ------------------
  |  Branch (1100:9): [True: 1.07M, False: 1.54M]
  ------------------
 1101|  1.07M|      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
 1102|  1.07M|      a1_128 = _mm_srli_si128(a0_128, 8);
 1103|       |
 1104|  1.07M|      base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
 1105|  1.07M|                                   base + 10, base + 12, base + 14);
 1106|  1.07M|      shift = _mm256_srli_epi16(
 1107|  1.07M|          _mm256_and_si256(
 1108|  1.07M|              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
 1109|  1.07M|              _mm256_set1_epi16(0x3f)),
 1110|  1.07M|          1);
 1111|  1.54M|    } else {
 1112|  1.54M|      base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
 1113|  1.54M|                                   base + 5, base + 6, base + 7);
 1114|  1.54M|      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1115|  1.54M|    }
 1116|  2.61M|    a0 = _mm256_castsi128_si256(a0_128);
 1117|  2.61M|    a1 = _mm256_castsi128_si256(a1_128);
 1118|  2.61M|    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1119|  2.61M|    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1120|  2.61M|    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1121|       |
 1122|  2.61M|    b = _mm256_mullo_epi16(diff, shift);
 1123|  2.61M|    res = _mm256_add_epi16(a32, b);
 1124|  2.61M|    res = _mm256_srli_epi16(res, 5);
 1125|  2.61M|    res1 = _mm256_castsi256_si128(res);
 1126|       |
 1127|  2.61M|    mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
 1128|  2.61M|    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
 1129|  2.61M|    x += dx;
 1130|  2.61M|  }
 1131|   393k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_4xN_internal_avx2:
 1134|   145k|    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
 1135|   145k|  const int frac_bits = 6 - upsample_above;
 1136|   145k|  const int max_base_x = ((N + 4) - 1) << upsample_above;
 1137|       |
 1138|   145k|  assert(dx > 0);
 1139|       |  // pre-filter above pixels
 1140|       |  // store in temp buffers:
 1141|       |  //   above[x] * 32 + 16
 1142|       |  //   above[x+1] - above[x]
 1143|       |  // final pixels will be calculated as:
 1144|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1145|   145k|  __m256i a0, a1, a32, a16;
 1146|   145k|  __m256i diff;
 1147|   145k|  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
 1148|       |
 1149|   145k|  a16 = _mm256_set1_epi32(16);
 1150|   145k|  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
 1151|   145k|  max_base_x128 = _mm_set1_epi32(max_base_x);
 1152|       |
 1153|   145k|  int x = dx;
 1154|  1.20M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1154:19): [True: 1.05M, False: 145k]
  ------------------
 1155|  1.05M|    __m256i b, res, shift;
 1156|  1.05M|    __m128i res1;
 1157|       |
 1158|  1.05M|    int base = x >> frac_bits;
 1159|  1.05M|    if (base >= max_base_x) {
  ------------------
  |  Branch (1159:9): [True: 828, False: 1.05M]
  ------------------
 1160|  2.78k|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1160:23): [True: 1.96k, False: 828]
  ------------------
 1161|  1.96k|        dst[i] = a_mbase_x;  // save 4 values
 1162|  1.96k|      }
 1163|    828|      return;
 1164|    828|    }
 1165|       |
 1166|  1.05M|    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
 1167|  1.05M|    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
 1168|       |
 1169|  1.05M|    if (upsample_above) {
  ------------------
  |  Branch (1169:9): [True: 125k, False: 928k]
  ------------------
 1170|   125k|      a0 = _mm256_permutevar8x32_epi32(
 1171|   125k|          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
 1172|   125k|      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
 1173|   125k|      base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
 1174|   125k|      shift = _mm256_srli_epi32(
 1175|   125k|          _mm256_and_si256(
 1176|   125k|              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
 1177|   125k|              _mm256_set1_epi32(0x3f)),
 1178|   125k|          1);
 1179|   928k|    } else {
 1180|   928k|      base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
 1181|   928k|      shift = _mm256_srli_epi32(
 1182|   928k|          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
 1183|   928k|    }
 1184|       |
 1185|  1.05M|    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1186|  1.05M|    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1187|  1.05M|    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1188|       |
 1189|  1.05M|    b = _mm256_mullo_epi32(diff, shift);
 1190|  1.05M|    res = _mm256_add_epi32(a32, b);
 1191|  1.05M|    res = _mm256_srli_epi32(res, 5);
 1192|       |
 1193|  1.05M|    res1 = _mm256_castsi256_si128(res);
 1194|  1.05M|    res1 = _mm_packus_epi32(res1, res1);
 1195|       |
 1196|  1.05M|    mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
 1197|  1.05M|    mask128 = _mm_packs_epi32(mask128, mask128);  // goto 16 bit
 1198|  1.05M|    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
 1199|  1.05M|    x += dx;
 1200|  1.05M|  }
 1201|   145k|}
intrapred_avx2.c:highbd_dr_prediction_z1_8xN_avx2:
 1390|   269k|                                             int bd) {
 1391|   269k|  __m128i dstvec[32];
 1392|   269k|  if (bd < 12) {
  ------------------
  |  Branch (1392:7): [True: 146k, False: 123k]
  ------------------
 1393|   146k|    highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
 1394|   146k|                                              dx);
 1395|   146k|  } else {
 1396|   123k|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
 1397|   123k|                                                    upsample_above, dx);
 1398|   123k|  }
 1399|  2.78M|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (1399:19): [True: 2.52M, False: 269k]
  ------------------
 1400|  2.52M|    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
 1401|  2.52M|  }
 1402|   269k|}
intrapred_avx2.c:highbd_dr_prediction_z1_8xN_internal_avx2:
 1305|   376k|    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
 1306|   376k|  const int frac_bits = 6 - upsample_above;
 1307|   376k|  const int max_base_x = ((8 + N) - 1) << upsample_above;
 1308|       |
 1309|   376k|  assert(dx > 0);
 1310|       |  // pre-filter above pixels
 1311|       |  // store in temp buffers:
 1312|       |  //   above[x] * 32 + 16
 1313|       |  //   above[x+1] - above[x]
 1314|       |  // final pixels will be calculated as:
 1315|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1316|   376k|  __m256i a0, a1, a32, a16, c3f;
 1317|   376k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1318|   376k|  __m128i a0_x128, a1_x128;
 1319|       |
 1320|   376k|  a16 = _mm256_set1_epi16(16);
 1321|   376k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1322|   376k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1323|   376k|  c3f = _mm256_set1_epi16(0x3f);
 1324|       |
 1325|   376k|  int x = dx;
 1326|  5.12M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1326:19): [True: 4.75M, False: 375k]
  ------------------
 1327|  4.75M|    __m256i b, res, res1, shift;
 1328|       |
 1329|  4.75M|    int base = x >> frac_bits;
 1330|  4.75M|    if (base >= max_base_x) {
  ------------------
  |  Branch (1330:9): [True: 1.44k, False: 4.75M]
  ------------------
 1331|  5.12k|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1331:23): [True: 3.67k, False: 1.44k]
  ------------------
 1332|  3.67k|        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
 1333|  3.67k|      }
 1334|  1.44k|      return;
 1335|  1.44k|    }
 1336|       |
 1337|  4.75M|    a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
 1338|  4.75M|    if (upsample_above) {
  ------------------
  |  Branch (1338:9): [True: 1.12M, False: 3.62M]
  ------------------
 1339|  1.12M|      __m128i mask, atmp0, atmp1, atmp2, atmp3;
 1340|  1.12M|      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
 1341|  1.12M|      atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
 1342|  1.12M|      atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
 1343|  1.12M|      atmp2 =
 1344|  1.12M|          _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
 1345|  1.12M|      atmp3 =
 1346|  1.12M|          _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
 1347|  1.12M|      mask =
 1348|  1.12M|          _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
 1349|  1.12M|      a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
 1350|  1.12M|      mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
 1351|  1.12M|                            _mm_set1_epi8(15));
 1352|  1.12M|      a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
 1353|       |
 1354|  1.12M|      base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
 1355|  1.12M|                                      base + 8, base + 10, base + 12, base + 14,
 1356|  1.12M|                                      0, 0, 0, 0, 0, 0, 0, 0);
 1357|  1.12M|      shift = _mm256_srli_epi16(
 1358|  1.12M|          _mm256_and_si256(
 1359|  1.12M|              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
 1360|  1.12M|          1);
 1361|  3.62M|    } else {
 1362|  3.62M|      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
 1363|  3.62M|      base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
 1364|  3.62M|                                      base + 4, base + 5, base + 6, base + 7, 0,
 1365|  3.62M|                                      0, 0, 0, 0, 0, 0, 0);
 1366|  3.62M|      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1367|  3.62M|    }
 1368|  4.75M|    a0 = _mm256_castsi128_si256(a0_x128);
 1369|  4.75M|    a1 = _mm256_castsi128_si256(a1_x128);
 1370|       |
 1371|  4.75M|    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1372|  4.75M|    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1373|  4.75M|    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1374|       |
 1375|  4.75M|    b = _mm256_mullo_epi16(diff, shift);
 1376|  4.75M|    res = _mm256_add_epi16(a32, b);
 1377|  4.75M|    res = _mm256_srli_epi16(res, 5);
 1378|       |
 1379|  4.75M|    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1380|  4.75M|    res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
 1381|  4.75M|    dst[r] = _mm256_castsi256_si128(res1);
 1382|  4.75M|    x += dx;
 1383|  4.75M|  }
 1384|   376k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_8xN_internal_avx2:
 1222|   282k|    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
 1223|   282k|  const int frac_bits = 6 - upsample_above;
 1224|   282k|  const int max_base_x = ((8 + N) - 1) << upsample_above;
 1225|       |
 1226|   282k|  assert(dx > 0);
 1227|       |  // pre-filter above pixels
 1228|       |  // store in temp buffers:
 1229|       |  //   above[x] * 32 + 16
 1230|       |  //   above[x+1] - above[x]
 1231|       |  // final pixels will be calculated as:
 1232|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1233|   282k|  __m256i a0, a1, a0_1, a1_1, a32, a16;
 1234|   282k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1235|       |
 1236|   282k|  a16 = _mm256_set1_epi32(16);
 1237|   282k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1238|   282k|  max_base_x256 = _mm256_set1_epi32(max_base_x);
 1239|       |
 1240|   282k|  int x = dx;
 1241|  3.07M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1241:19): [True: 2.79M, False: 281k]
  ------------------
 1242|  2.79M|    __m256i b, res, res1, shift;
 1243|       |
 1244|  2.79M|    int base = x >> frac_bits;
 1245|  2.79M|    if (base >= max_base_x) {
  ------------------
  |  Branch (1245:9): [True: 817, False: 2.79M]
  ------------------
 1246|  2.34k|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1246:23): [True: 1.52k, False: 817]
  ------------------
 1247|  1.52k|        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
 1248|  1.52k|      }
 1249|    817|      return;
 1250|    817|    }
 1251|       |
 1252|  2.79M|    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
 1253|  2.79M|    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
 1254|       |
 1255|  2.79M|    if (upsample_above) {
  ------------------
  |  Branch (1255:9): [True: 300k, False: 2.49M]
  ------------------
 1256|   300k|      a0 = _mm256_permutevar8x32_epi32(
 1257|   300k|          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
 1258|   300k|      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
 1259|       |
 1260|   300k|      a0_1 =
 1261|   300k|          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
 1262|   300k|      a0_1 = _mm256_permutevar8x32_epi32(
 1263|   300k|          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
 1264|   300k|      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
 1265|       |
 1266|   300k|      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
 1267|   300k|      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
 1268|   300k|      base_inc256 =
 1269|   300k|          _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
 1270|   300k|                            base + 10, base + 12, base + 14);
 1271|   300k|      shift = _mm256_srli_epi32(
 1272|   300k|          _mm256_and_si256(
 1273|   300k|              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
 1274|   300k|              _mm256_set1_epi32(0x3f)),
 1275|   300k|          1);
 1276|  2.49M|    } else {
 1277|  2.49M|      base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
 1278|  2.49M|                                      base + 4, base + 5, base + 6, base + 7);
 1279|  2.49M|      shift = _mm256_srli_epi32(
 1280|  2.49M|          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
 1281|  2.49M|    }
 1282|       |
 1283|  2.79M|    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1284|  2.79M|    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1285|  2.79M|    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1286|       |
 1287|  2.79M|    b = _mm256_mullo_epi32(diff, shift);
 1288|  2.79M|    res = _mm256_add_epi32(a32, b);
 1289|  2.79M|    res = _mm256_srli_epi32(res, 5);
 1290|       |
 1291|  2.79M|    res1 = _mm256_packus_epi32(
 1292|  2.79M|        res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
 1293|       |
 1294|  2.79M|    mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
 1295|  2.79M|    mask256 = _mm256_packs_epi32(
 1296|  2.79M|        mask256, _mm256_castsi128_si256(
 1297|  2.79M|                     _mm256_extracti128_si256(mask256, 1)));  // goto 16 bit
 1298|  2.79M|    res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
 1299|  2.79M|    dst[r] = _mm256_castsi256_si128(res1);
 1300|  2.79M|    x += dx;
 1301|  2.79M|  }
 1302|   282k|}
intrapred_avx2.c:highbd_dr_prediction_z1_16xN_avx2:
 1543|   209k|                                              int bd) {
 1544|   209k|  __m256i dstvec[64];
 1545|   209k|  if (bd < 12) {
  ------------------
  |  Branch (1545:7): [True: 121k, False: 88.0k]
  ------------------
 1546|   121k|    highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
 1547|   121k|                                               dx);
 1548|   121k|  } else {
 1549|  88.0k|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
 1550|  88.0k|                                                     upsample_above, dx);
 1551|  88.0k|  }
 1552|  2.91M|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (1552:19): [True: 2.70M, False: 209k]
  ------------------
 1553|  2.70M|    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
 1554|  2.70M|  }
 1555|   209k|}
intrapred_avx2.c:highbd_dr_prediction_z1_16xN_internal_avx2:
 1484|   304k|    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
 1485|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1486|   304k|  (void)upsample_above;
 1487|   304k|  const int frac_bits = 6;
 1488|   304k|  const int max_base_x = ((16 + N) - 1);
 1489|       |
 1490|       |  // pre-filter above pixels
 1491|       |  // store in temp buffers:
 1492|       |  //   above[x] * 32 + 16
 1493|       |  //   above[x+1] - above[x]
 1494|       |  // final pixels will be calculated as:
 1495|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1496|   304k|  __m256i a0, a1, a32, a16, c3f;
 1497|   304k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1498|       |
 1499|   304k|  a16 = _mm256_set1_epi16(16);
 1500|   304k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1501|   304k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1502|   304k|  c3f = _mm256_set1_epi16(0x3f);
 1503|       |
 1504|   304k|  int x = dx;
 1505|  5.80M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1505:19): [True: 5.50M, False: 304k]
  ------------------
 1506|  5.50M|    __m256i b, res;
 1507|       |
 1508|  5.50M|    int base = x >> frac_bits;
 1509|  5.50M|    if (base >= max_base_x) {
  ------------------
  |  Branch (1509:9): [True: 452, False: 5.50M]
  ------------------
 1510|  2.26k|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1510:23): [True: 1.81k, False: 452]
  ------------------
 1511|  1.81k|        dstvec[i] = a_mbase_x;  // save 16 values
 1512|  1.81k|      }
 1513|    452|      return;
 1514|    452|    }
 1515|  5.50M|    __m256i shift =
 1516|  5.50M|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1517|       |
 1518|  5.50M|    a0 = _mm256_loadu_si256((__m256i *)(above + base));
 1519|  5.50M|    a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
 1520|       |
 1521|  5.50M|    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1522|  5.50M|    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1523|  5.50M|    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1524|  5.50M|    b = _mm256_mullo_epi16(diff, shift);
 1525|       |
 1526|  5.50M|    res = _mm256_add_epi16(a32, b);
 1527|  5.50M|    res = _mm256_srli_epi16(res, 5);  // 16 16bit values
 1528|       |
 1529|  5.50M|    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
 1530|  5.50M|                                    base + 4, base + 5, base + 6, base + 7,
 1531|  5.50M|                                    base + 8, base + 9, base + 10, base + 11,
 1532|  5.50M|                                    base + 12, base + 13, base + 14, base + 15);
 1533|  5.50M|    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1534|  5.50M|    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
 1535|  5.50M|    x += dx;
 1536|  5.50M|  }
 1537|   304k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_16xN_internal_avx2:
 1405|   131k|    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
 1406|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1407|   131k|  (void)upsample_above;
 1408|   131k|  const int frac_bits = 6;
 1409|   131k|  const int max_base_x = ((16 + N) - 1);
 1410|       |
 1411|       |  // pre-filter above pixels
 1412|       |  // store in temp buffers:
 1413|       |  //   above[x] * 32 + 16
 1414|       |  //   above[x+1] - above[x]
 1415|       |  // final pixels will be calculated as:
 1416|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1417|   131k|  __m256i a0, a0_1, a1, a1_1, a32, a16;
 1418|   131k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1419|       |
 1420|   131k|  a16 = _mm256_set1_epi32(16);
 1421|   131k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1422|   131k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1423|       |
 1424|   131k|  int x = dx;
 1425|  1.64M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1425:19): [True: 1.51M, False: 131k]
  ------------------
 1426|  1.51M|    __m256i b, res[2], res1;
 1427|       |
 1428|  1.51M|    int base = x >> frac_bits;
 1429|  1.51M|    if (base >= max_base_x) {
  ------------------
  |  Branch (1429:9): [True: 113, False: 1.51M]
  ------------------
 1430|    534|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1430:23): [True: 421, False: 113]
  ------------------
 1431|    421|        dstvec[i] = a_mbase_x;  // save 16 values
 1432|    421|      }
 1433|    113|      return;
 1434|    113|    }
 1435|  1.51M|    __m256i shift = _mm256_srli_epi32(
 1436|  1.51M|        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
 1437|       |
 1438|  1.51M|    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
 1439|  1.51M|    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
 1440|       |
 1441|  1.51M|    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1442|  1.51M|    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1443|  1.51M|    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1444|  1.51M|    b = _mm256_mullo_epi32(diff, shift);
 1445|       |
 1446|  1.51M|    res[0] = _mm256_add_epi32(a32, b);
 1447|  1.51M|    res[0] = _mm256_srli_epi32(res[0], 5);
 1448|  1.51M|    res[0] = _mm256_packus_epi32(
 1449|  1.51M|        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
 1450|       |
 1451|  1.51M|    int mdif = max_base_x - base;
 1452|  1.51M|    if (mdif > 8) {
  ------------------
  |  Branch (1452:9): [True: 1.51M, False: 2.71k]
  ------------------
 1453|  1.51M|      a0_1 =
 1454|  1.51M|          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
 1455|  1.51M|      a1_1 =
 1456|  1.51M|          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
 1457|       |
 1458|  1.51M|      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
 1459|  1.51M|      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
 1460|  1.51M|      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 1461|  1.51M|      b = _mm256_mullo_epi32(diff, shift);
 1462|       |
 1463|  1.51M|      res[1] = _mm256_add_epi32(a32, b);
 1464|  1.51M|      res[1] = _mm256_srli_epi32(res[1], 5);
 1465|  1.51M|      res[1] = _mm256_packus_epi32(
 1466|  1.51M|          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
 1467|  1.51M|    } else {
 1468|  2.71k|      res[1] = a_mbase_x;
 1469|  2.71k|    }
 1470|  1.51M|    res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
 1471|  1.51M|                                   1);  // 16 16bit values
 1472|       |
 1473|  1.51M|    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
 1474|  1.51M|                                    base + 4, base + 5, base + 6, base + 7,
 1475|  1.51M|                                    base + 8, base + 9, base + 10, base + 11,
 1476|  1.51M|                                    base + 12, base + 13, base + 14, base + 15);
 1477|  1.51M|    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1478|  1.51M|    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
 1479|  1.51M|    x += dx;
 1480|  1.51M|  }
 1481|   131k|}
intrapred_avx2.c:highbd_dr_prediction_z1_32xN_avx2:
 1730|  80.9k|                                              int bd) {
 1731|  80.9k|  __m256i dstvec[128];
 1732|  80.9k|  if (bd < 12) {
  ------------------
  |  Branch (1732:7): [True: 69.8k, False: 11.0k]
  ------------------
 1733|  69.8k|    highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
 1734|  69.8k|                                               dx);
 1735|  69.8k|  } else {
 1736|  11.0k|    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
 1737|  11.0k|                                                     upsample_above, dx);
 1738|  11.0k|  }
 1739|  2.16M|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (1739:19): [True: 2.08M, False: 80.9k]
  ------------------
 1740|  2.08M|    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
 1741|  2.08M|    _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
 1742|  2.08M|  }
 1743|  80.9k|}
intrapred_avx2.c:highbd_dr_prediction_z1_32xN_internal_avx2:
 1655|   193k|    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
 1656|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1657|   193k|  (void)upsample_above;
 1658|   193k|  const int frac_bits = 6;
 1659|   193k|  const int max_base_x = ((32 + N) - 1);
 1660|       |
 1661|       |  // pre-filter above pixels
 1662|       |  // store in temp buffers:
 1663|       |  //   above[x] * 32 + 16
 1664|       |  //   above[x+1] - above[x]
 1665|       |  // final pixels will be calculated as:
 1666|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1667|   193k|  __m256i a0, a1, a32, a16, c3f;
 1668|   193k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1669|       |
 1670|   193k|  a16 = _mm256_set1_epi16(16);
 1671|   193k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1672|   193k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1673|   193k|  c3f = _mm256_set1_epi16(0x3f);
 1674|       |
 1675|   193k|  int x = dx;
 1676|  5.35M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1676:19): [True: 5.16M, False: 193k]
  ------------------
 1677|  5.16M|    __m256i b, res;
 1678|       |
 1679|  5.16M|    int base = x >> frac_bits;
 1680|  5.16M|    if (base >= max_base_x) {
  ------------------
  |  Branch (1680:9): [True: 0, False: 5.16M]
  ------------------
 1681|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1681:23): [True: 0, False: 0]
  ------------------
 1682|      0|        dstvec[i] = a_mbase_x;  // save 32 values
 1683|      0|        dstvec[i + N] = a_mbase_x;
 1684|      0|      }
 1685|      0|      return;
 1686|      0|    }
 1687|       |
 1688|  5.16M|    __m256i shift =
 1689|  5.16M|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1690|       |
 1691|  15.4M|    for (int j = 0; j < 32; j += 16) {
  ------------------
  |  Branch (1691:21): [True: 10.3M, False: 5.16M]
  ------------------
 1692|  10.3M|      int mdif = max_base_x - (base + j);
 1693|  10.3M|      if (mdif <= 0) {
  ------------------
  |  Branch (1693:11): [True: 1.13k, False: 10.3M]
  ------------------
 1694|  1.13k|        res = a_mbase_x;
 1695|  10.3M|      } else {
 1696|  10.3M|        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
 1697|  10.3M|        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
 1698|       |
 1699|  10.3M|        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1700|  10.3M|        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1701|  10.3M|        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1702|  10.3M|        b = _mm256_mullo_epi16(diff, shift);
 1703|       |
 1704|  10.3M|        res = _mm256_add_epi16(a32, b);
 1705|  10.3M|        res = _mm256_srli_epi16(res, 5);
 1706|       |
 1707|  10.3M|        base_inc256 = _mm256_setr_epi16(
 1708|  10.3M|            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
 1709|  10.3M|            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
 1710|  10.3M|            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
 1711|  10.3M|            base + j + 13, base + j + 14, base + j + 15);
 1712|       |
 1713|  10.3M|        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1714|  10.3M|        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
 1715|  10.3M|      }
 1716|  10.3M|      if (!j) {
  ------------------
  |  Branch (1716:11): [True: 5.16M, False: 5.16M]
  ------------------
 1717|  5.16M|        dstvec[r] = res;
 1718|  5.16M|      } else {
 1719|  5.16M|        dstvec[r + N] = res;
 1720|  5.16M|      }
 1721|  10.3M|    }
 1722|  5.16M|    x += dx;
 1723|  5.16M|  }
 1724|   193k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_32xN_internal_avx2:
 1558|  21.7k|    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
 1559|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1560|  21.7k|  (void)upsample_above;
 1561|  21.7k|  const int frac_bits = 6;
 1562|  21.7k|  const int max_base_x = ((32 + N) - 1);
 1563|       |
 1564|       |  // pre-filter above pixels
 1565|       |  // store in temp buffers:
 1566|       |  //   above[x] * 32 + 16
 1567|       |  //   above[x+1] - above[x]
 1568|       |  // final pixels will be calculated as:
 1569|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1570|  21.7k|  __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
 1571|  21.7k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1572|       |
 1573|  21.7k|  a16 = _mm256_set1_epi32(16);
 1574|  21.7k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1575|  21.7k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1576|  21.7k|  c3f = _mm256_set1_epi16(0x3f);
 1577|       |
 1578|  21.7k|  int x = dx;
 1579|   492k|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (1579:19): [True: 470k, False: 21.7k]
  ------------------
 1580|   470k|    __m256i b, res[2], res1;
 1581|       |
 1582|   470k|    int base = x >> frac_bits;
 1583|   470k|    if (base >= max_base_x) {
  ------------------
  |  Branch (1583:9): [True: 0, False: 470k]
  ------------------
 1584|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1584:23): [True: 0, False: 0]
  ------------------
 1585|      0|        dstvec[i] = a_mbase_x;  // save 32 values
 1586|      0|        dstvec[i + N] = a_mbase_x;
 1587|      0|      }
 1588|      0|      return;
 1589|      0|    }
 1590|       |
 1591|   470k|    __m256i shift =
 1592|   470k|        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
 1593|       |
 1594|  1.41M|    for (int j = 0; j < 32; j += 16) {
  ------------------
  |  Branch (1594:21): [True: 940k, False: 470k]
  ------------------
 1595|   940k|      int mdif = max_base_x - (base + j);
 1596|   940k|      if (mdif <= 0) {
  ------------------
  |  Branch (1596:11): [True: 681, False: 940k]
  ------------------
 1597|    681|        res1 = a_mbase_x;
 1598|   940k|      } else {
 1599|   940k|        a0 = _mm256_cvtepu16_epi32(
 1600|   940k|            _mm_loadu_si128((__m128i *)(above + base + j)));
 1601|   940k|        a1 = _mm256_cvtepu16_epi32(
 1602|   940k|            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
 1603|       |
 1604|   940k|        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1605|   940k|        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1606|   940k|        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1607|   940k|        b = _mm256_mullo_epi32(diff, shift);
 1608|       |
 1609|   940k|        res[0] = _mm256_add_epi32(a32, b);
 1610|   940k|        res[0] = _mm256_srli_epi32(res[0], 5);
 1611|   940k|        res[0] = _mm256_packus_epi32(
 1612|   940k|            res[0],
 1613|   940k|            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
 1614|   940k|        if (mdif > 8) {
  ------------------
  |  Branch (1614:13): [True: 936k, False: 3.42k]
  ------------------
 1615|   936k|          a0_1 = _mm256_cvtepu16_epi32(
 1616|   936k|              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
 1617|   936k|          a1_1 = _mm256_cvtepu16_epi32(
 1618|   936k|              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
 1619|       |
 1620|   936k|          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
 1621|   936k|          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
 1622|   936k|          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 1623|   936k|          b = _mm256_mullo_epi32(diff, shift);
 1624|       |
 1625|   936k|          res[1] = _mm256_add_epi32(a32, b);
 1626|   936k|          res[1] = _mm256_srli_epi32(res[1], 5);
 1627|   936k|          res[1] = _mm256_packus_epi32(
 1628|   936k|              res[1],
 1629|   936k|              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
 1630|   936k|        } else {
 1631|  3.42k|          res[1] = a_mbase_x;
 1632|  3.42k|        }
 1633|   940k|        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
 1634|   940k|                                       1);  // 16 16bit values
 1635|   940k|        base_inc256 = _mm256_setr_epi16(
 1636|   940k|            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
 1637|   940k|            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
 1638|   940k|            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
 1639|   940k|            base + j + 13, base + j + 14, base + j + 15);
 1640|       |
 1641|   940k|        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1642|   940k|        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
 1643|   940k|      }
 1644|   940k|      if (!j) {
  ------------------
  |  Branch (1644:11): [True: 470k, False: 470k]
  ------------------
 1645|   470k|        dstvec[r] = res1;
 1646|   470k|      } else {
 1647|   470k|        dstvec[r + N] = res1;
 1648|   470k|      }
 1649|   940k|    }
 1650|   470k|    x += dx;
 1651|   470k|  }
 1652|  21.7k|}
intrapred_avx2.c:highbd_dr_prediction_z1_64xN_avx2:
 1847|  44.8k|                                              int upsample_above, int dx) {
 1848|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1849|  44.8k|  (void)upsample_above;
 1850|  44.8k|  const int frac_bits = 6;
 1851|  44.8k|  const int max_base_x = ((64 + N) - 1);
 1852|       |
 1853|       |  // pre-filter above pixels
 1854|       |  // store in temp buffers:
 1855|       |  //   above[x] * 32 + 16
 1856|       |  //   above[x+1] - above[x]
 1857|       |  // final pixels will be calculated as:
 1858|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1859|  44.8k|  __m256i a0, a1, a32, a16, c3f;
 1860|  44.8k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1861|       |
 1862|  44.8k|  a16 = _mm256_set1_epi16(16);
 1863|  44.8k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1864|  44.8k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1865|  44.8k|  c3f = _mm256_set1_epi16(0x3f);
 1866|       |
 1867|  44.8k|  int x = dx;
 1868|  2.36M|  for (int r = 0; r < N; r++, dst += stride) {
  ------------------
  |  Branch (1868:19): [True: 2.32M, False: 44.8k]
  ------------------
 1869|  2.32M|    __m256i b, res;
 1870|       |
 1871|  2.32M|    int base = x >> frac_bits;
 1872|  2.32M|    if (base >= max_base_x) {
  ------------------
  |  Branch (1872:9): [True: 0, False: 2.32M]
  ------------------
 1873|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1873:23): [True: 0, False: 0]
  ------------------
 1874|      0|        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
 1875|      0|        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
 1876|      0|        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
 1877|      0|        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
 1878|      0|        dst += stride;
 1879|      0|      }
 1880|      0|      return;
 1881|      0|    }
 1882|       |
 1883|  2.32M|    __m256i shift =
 1884|  2.32M|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 1885|       |
 1886|  11.6M|    for (int j = 0; j < 64; j += 16) {
  ------------------
  |  Branch (1886:21): [True: 9.28M, False: 2.32M]
  ------------------
 1887|  9.28M|      int mdif = max_base_x - (base + j);
 1888|  9.28M|      if (mdif <= 0) {
  ------------------
  |  Branch (1888:11): [True: 2.42k, False: 9.28M]
  ------------------
 1889|  2.42k|        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
 1890|  9.28M|      } else {
 1891|  9.28M|        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
 1892|  9.28M|        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
 1893|       |
 1894|  9.28M|        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 1895|  9.28M|        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 1896|  9.28M|        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 1897|  9.28M|        b = _mm256_mullo_epi16(diff, shift);
 1898|       |
 1899|  9.28M|        res = _mm256_add_epi16(a32, b);
 1900|  9.28M|        res = _mm256_srli_epi16(res, 5);
 1901|       |
 1902|  9.28M|        base_inc256 = _mm256_setr_epi16(
 1903|  9.28M|            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
 1904|  9.28M|            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
 1905|  9.28M|            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
 1906|  9.28M|            base + j + 13, base + j + 14, base + j + 15);
 1907|       |
 1908|  9.28M|        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1909|  9.28M|        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
 1910|  9.28M|        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
 1911|  9.28M|      }
 1912|  9.28M|    }
 1913|  2.32M|    x += dx;
 1914|  2.32M|  }
 1915|  44.8k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z1_64xN_avx2:
 1749|  18.4k|                                                    int dx) {
 1750|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 1751|  18.4k|  (void)upsample_above;
 1752|  18.4k|  const int frac_bits = 6;
 1753|  18.4k|  const int max_base_x = ((64 + N) - 1);
 1754|       |
 1755|       |  // pre-filter above pixels
 1756|       |  // store in temp buffers:
 1757|       |  //   above[x] * 32 + 16
 1758|       |  //   above[x+1] - above[x]
 1759|       |  // final pixels will be calculated as:
 1760|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1761|  18.4k|  __m256i a0, a0_1, a1, a1_1, a32, a16;
 1762|  18.4k|  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
 1763|       |
 1764|  18.4k|  a16 = _mm256_set1_epi32(16);
 1765|  18.4k|  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
 1766|  18.4k|  max_base_x256 = _mm256_set1_epi16(max_base_x);
 1767|       |
 1768|  18.4k|  int x = dx;
 1769|  1.12M|  for (int r = 0; r < N; r++, dst += stride) {
  ------------------
  |  Branch (1769:19): [True: 1.11M, False: 18.4k]
  ------------------
 1770|  1.11M|    __m256i b, res[2], res1;
 1771|       |
 1772|  1.11M|    int base = x >> frac_bits;
 1773|  1.11M|    if (base >= max_base_x) {
  ------------------
  |  Branch (1773:9): [True: 0, False: 1.11M]
  ------------------
 1774|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (1774:23): [True: 0, False: 0]
  ------------------
 1775|      0|        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
 1776|      0|        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
 1777|      0|        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
 1778|      0|        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
 1779|      0|        dst += stride;
 1780|      0|      }
 1781|      0|      return;
 1782|      0|    }
 1783|       |
 1784|  1.11M|    __m256i shift = _mm256_srli_epi32(
 1785|  1.11M|        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
 1786|       |
 1787|  1.11M|    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
 1788|  5.55M|    for (int j = 0; j < 64; j += 16) {
  ------------------
  |  Branch (1788:21): [True: 4.44M, False: 1.11M]
  ------------------
 1789|  4.44M|      int mdif = max_base_x - (base + j);
 1790|  4.44M|      if (mdif <= 0) {
  ------------------
  |  Branch (1790:11): [True: 3.50k, False: 4.44M]
  ------------------
 1791|  3.50k|        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
 1792|  4.44M|      } else {
 1793|  4.44M|        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
 1794|  4.44M|        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
 1795|  4.44M|        a0 = _mm256_cvtepu16_epi32(a0_128);
 1796|  4.44M|        a1 = _mm256_cvtepu16_epi32(a1_128);
 1797|       |
 1798|  4.44M|        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
 1799|  4.44M|        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
 1800|  4.44M|        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
 1801|  4.44M|        b = _mm256_mullo_epi32(diff, shift);
 1802|       |
 1803|  4.44M|        res[0] = _mm256_add_epi32(a32, b);
 1804|  4.44M|        res[0] = _mm256_srli_epi32(res[0], 5);
 1805|  4.44M|        res[0] = _mm256_packus_epi32(
 1806|  4.44M|            res[0],
 1807|  4.44M|            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
 1808|  4.44M|        if (mdif > 8) {
  ------------------
  |  Branch (1808:13): [True: 4.43M, False: 5.50k]
  ------------------
 1809|  4.43M|          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
 1810|  4.43M|          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
 1811|  4.43M|          a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
 1812|  4.43M|          a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
 1813|       |
 1814|  4.43M|          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
 1815|  4.43M|          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
 1816|  4.43M|          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 1817|  4.43M|          b = _mm256_mullo_epi32(diff, shift);
 1818|       |
 1819|  4.43M|          res[1] = _mm256_add_epi32(a32, b);
 1820|  4.43M|          res[1] = _mm256_srli_epi32(res[1], 5);
 1821|  4.43M|          res[1] = _mm256_packus_epi32(
 1822|  4.43M|              res[1],
 1823|  4.43M|              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
 1824|  4.43M|        } else {
 1825|  5.50k|          res[1] = a_mbase_x;
 1826|  5.50k|        }
 1827|  4.44M|        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
 1828|  4.44M|                                       1);  // 16 16bit values
 1829|  4.44M|        base_inc256 = _mm256_setr_epi16(
 1830|  4.44M|            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
 1831|  4.44M|            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
 1832|  4.44M|            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
 1833|  4.44M|            base + j + 13, base + j + 14, base + j + 15);
 1834|       |
 1835|  4.44M|        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
 1836|  4.44M|        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
 1837|  4.44M|        _mm256_storeu_si256((__m256i *)(dst + j), res1);
 1838|  4.44M|      }
 1839|  4.44M|    }
 1840|  1.11M|    x += dx;
 1841|  1.11M|  }
 1842|  18.4k|}
intrapred_avx2.c:highbd_dr_prediction_z2_Nx4_avx2:
 2107|   213k|    int dy) {
 2108|   213k|  const int min_base_x = -(1 << upsample_above);
 2109|   213k|  const int min_base_y = -(1 << upsample_left);
 2110|   213k|  const int frac_bits_x = 6 - upsample_above;
 2111|   213k|  const int frac_bits_y = 6 - upsample_left;
 2112|       |
 2113|   213k|  assert(dx > 0);
 2114|       |  // pre-filter above pixels
 2115|       |  // store in temp buffers:
 2116|       |  //   above[x] * 32 + 16
 2117|       |  //   above[x+1] - above[x]
 2118|       |  // final pixels will be calculated as:
 2119|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2120|   213k|  __m256i a0_x, a1_x, a32, a16;
 2121|   213k|  __m256i diff;
 2122|   213k|  __m128i c3f, min_base_y128;
 2123|       |
 2124|   213k|  a16 = _mm256_set1_epi16(16);
 2125|   213k|  c3f = _mm_set1_epi16(0x3f);
 2126|   213k|  min_base_y128 = _mm_set1_epi16(min_base_y);
 2127|       |
 2128|  1.59M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (2128:19): [True: 1.38M, False: 213k]
  ------------------
 2129|  1.38M|    __m256i b, res, shift;
 2130|  1.38M|    __m128i resx, resy, resxy;
 2131|  1.38M|    __m128i a0_x128, a1_x128;
 2132|  1.38M|    int y = r + 1;
 2133|  1.38M|    int base_x = (-y * dx) >> frac_bits_x;
 2134|  1.38M|    int base_shift = 0;
 2135|  1.38M|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (2135:9): [True: 976k, False: 405k]
  ------------------
 2136|   976k|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 2137|   976k|    }
 2138|  1.38M|    int base_min_diff =
 2139|  1.38M|        (min_base_x - base_x + upsample_above) >> upsample_above;
 2140|  1.38M|    if (base_min_diff > 4) {
  ------------------
  |  Branch (2140:9): [True: 604k, False: 776k]
  ------------------
 2141|   604k|      base_min_diff = 4;
 2142|   776k|    } else {
 2143|   776k|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2143:11): [True: 0, False: 776k]
  ------------------
 2144|   776k|    }
 2145|       |
 2146|  1.38M|    if (base_shift > 3) {
  ------------------
  |  Branch (2146:9): [True: 604k, False: 776k]
  ------------------
 2147|   604k|      a0_x = _mm256_setzero_si256();
 2148|   604k|      a1_x = _mm256_setzero_si256();
 2149|   604k|      shift = _mm256_setzero_si256();
 2150|   776k|    } else {
 2151|   776k|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2152|   776k|      if (upsample_above) {
  ------------------
  |  Branch (2152:11): [True: 325k, False: 450k]
  ------------------
 2153|   325k|        a0_x128 = _mm_shuffle_epi8(a0_x128,
 2154|   325k|                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
 2155|   325k|        a1_x128 = _mm_srli_si128(a0_x128, 8);
 2156|       |
 2157|   325k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 2158|   325k|            _mm_and_si128(
 2159|   325k|                _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
 2160|   325k|                                              (2 << 6) - y * dx,
 2161|   325k|                                              (3 << 6) - y * dx, 0, 0, 0, 0),
 2162|   325k|                               upsample_above),
 2163|   325k|                c3f),
 2164|   325k|            1));
 2165|   450k|      } else {
 2166|   450k|        a0_x128 =
 2167|   450k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2168|   450k|        a1_x128 = _mm_srli_si128(a0_x128, 2);
 2169|       |
 2170|   450k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 2171|   450k|            _mm_and_si128(
 2172|   450k|                _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
 2173|   450k|                               (3 << 6) - y * dx, 0, 0, 0, 0),
 2174|   450k|                c3f),
 2175|   450k|            1));
 2176|   450k|      }
 2177|   776k|      a0_x = _mm256_castsi128_si256(a0_x128);
 2178|   776k|      a1_x = _mm256_castsi128_si256(a1_x128);
 2179|   776k|    }
 2180|       |    // y calc
 2181|  1.38M|    __m128i a0_y, a1_y, shifty;
 2182|  1.38M|    if (base_x < min_base_x) {
  ------------------
  |  Branch (2182:9): [True: 1.11M, False: 261k]
  ------------------
 2183|  1.11M|      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
 2184|  1.11M|      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
  ------------------
  |  |   19|  1.11M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2185|  1.11M|      r6 = _mm_set1_epi16(r << 6);
 2186|  1.11M|      dy128 = _mm_set1_epi16(dy);
 2187|  1.11M|      c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
 2188|  1.11M|      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
 2189|  1.11M|      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
 2190|  1.11M|      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
 2191|  1.11M|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 2192|  1.11M|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 2193|       |
 2194|  1.11M|      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 2195|  1.11M|                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
 2196|  1.11M|      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
 2197|  1.11M|                            left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
 2198|  1.11M|                            0, 0);
 2199|       |
 2200|  1.11M|      if (upsample_left) {
  ------------------
  |  Branch (2200:11): [True: 346k, False: 773k]
  ------------------
 2201|   346k|        shifty = _mm_srli_epi16(
 2202|   346k|            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
 2203|   773k|      } else {
 2204|   773k|        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
 2205|   773k|      }
 2206|  1.11M|      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
 2207|  1.11M|      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
 2208|  1.11M|      shift = _mm256_inserti128_si256(shift, shifty, 1);
 2209|  1.11M|    }
 2210|       |
 2211|  1.38M|    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 2212|  1.38M|    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 2213|  1.38M|    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 2214|       |
 2215|  1.38M|    b = _mm256_mullo_epi16(diff, shift);
 2216|  1.38M|    res = _mm256_add_epi16(a32, b);
 2217|  1.38M|    res = _mm256_srli_epi16(res, 5);
 2218|       |
 2219|  1.38M|    resx = _mm256_castsi256_si128(res);
 2220|  1.38M|    resy = _mm256_extracti128_si256(res, 1);
 2221|  1.38M|    resxy =
 2222|  1.38M|        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
 2223|  1.38M|    _mm_storel_epi64((__m128i *)(dst), resxy);
 2224|  1.38M|    dst += stride;
 2225|  1.38M|  }
 2226|   213k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z2_Nx4_avx2:
 1981|   290k|    int dy) {
 1982|   290k|  const int min_base_x = -(1 << upsample_above);
 1983|   290k|  const int min_base_y = -(1 << upsample_left);
 1984|   290k|  const int frac_bits_x = 6 - upsample_above;
 1985|   290k|  const int frac_bits_y = 6 - upsample_left;
 1986|       |
 1987|   290k|  assert(dx > 0);
 1988|       |  // pre-filter above pixels
 1989|       |  // store in temp buffers:
 1990|       |  //   above[x] * 32 + 16
 1991|       |  //   above[x+1] - above[x]
 1992|       |  // final pixels will be calculated as:
 1993|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 1994|   290k|  __m256i a0_x, a1_x, a32, a16;
 1995|   290k|  __m256i diff;
 1996|   290k|  __m128i c3f, min_base_y128;
 1997|       |
 1998|   290k|  a16 = _mm256_set1_epi32(16);
 1999|   290k|  c3f = _mm_set1_epi32(0x3f);
 2000|   290k|  min_base_y128 = _mm_set1_epi32(min_base_y);
 2001|       |
 2002|  1.73M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (2002:19): [True: 1.44M, False: 290k]
  ------------------
 2003|  1.44M|    __m256i b, res, shift;
 2004|  1.44M|    __m128i resx, resy, resxy;
 2005|  1.44M|    __m128i a0_x128, a1_x128;
 2006|  1.44M|    int y = r + 1;
 2007|  1.44M|    int base_x = (-y * dx) >> frac_bits_x;
 2008|  1.44M|    int base_shift = 0;
 2009|  1.44M|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (2009:9): [True: 1.21M, False: 233k]
  ------------------
 2010|  1.21M|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 2011|  1.21M|    }
 2012|  1.44M|    int base_min_diff =
 2013|  1.44M|        (min_base_x - base_x + upsample_above) >> upsample_above;
 2014|  1.44M|    if (base_min_diff > 4) {
  ------------------
  |  Branch (2014:9): [True: 1.00M, False: 443k]
  ------------------
 2015|  1.00M|      base_min_diff = 4;
 2016|  1.00M|    } else {
 2017|   443k|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2017:11): [True: 0, False: 443k]
  ------------------
 2018|   443k|    }
 2019|       |
 2020|  1.44M|    if (base_shift > 3) {
  ------------------
  |  Branch (2020:9): [True: 1.00M, False: 443k]
  ------------------
 2021|  1.00M|      a0_x = _mm256_setzero_si256();
 2022|  1.00M|      a1_x = _mm256_setzero_si256();
 2023|  1.00M|      shift = _mm256_setzero_si256();
 2024|  1.00M|    } else {
 2025|   443k|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2026|   443k|      if (upsample_above) {
  ------------------
  |  Branch (2026:11): [True: 76.6k, False: 366k]
  ------------------
 2027|  76.6k|        a0_x128 = _mm_shuffle_epi8(a0_x128,
 2028|  76.6k|                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
 2029|  76.6k|        a1_x128 = _mm_srli_si128(a0_x128, 8);
 2030|       |
 2031|  76.6k|        shift = _mm256_castsi128_si256(_mm_srli_epi32(
 2032|  76.6k|            _mm_and_si128(
 2033|  76.6k|                _mm_slli_epi32(
 2034|  76.6k|                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
 2035|  76.6k|                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
 2036|  76.6k|                    upsample_above),
 2037|  76.6k|                c3f),
 2038|  76.6k|            1));
 2039|   366k|      } else {
 2040|   366k|        a0_x128 =
 2041|   366k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2042|   366k|        a1_x128 = _mm_srli_si128(a0_x128, 2);
 2043|       |
 2044|   366k|        shift = _mm256_castsi128_si256(_mm_srli_epi32(
 2045|   366k|            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
 2046|   366k|                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
 2047|   366k|                          c3f),
 2048|   366k|            1));
 2049|   366k|      }
 2050|   443k|      a0_x = _mm256_cvtepu16_epi32(a0_x128);
 2051|   443k|      a1_x = _mm256_cvtepu16_epi32(a1_x128);
 2052|   443k|    }
 2053|       |    // y calc
 2054|  1.44M|    __m128i a0_y, a1_y, shifty;
 2055|  1.44M|    if (base_x < min_base_x) {
  ------------------
  |  Branch (2055:9): [True: 1.29M, False: 148k]
  ------------------
 2056|  1.29M|      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
 2057|  1.29M|      DECLARE_ALIGNED(32, int, base_y_c[4]);
  ------------------
  |  |   19|  1.29M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2058|  1.29M|      r6 = _mm_set1_epi32(r << 6);
 2059|  1.29M|      dy128 = _mm_set1_epi32(dy);
 2060|  1.29M|      c1234 = _mm_setr_epi32(1, 2, 3, 4);
 2061|  1.29M|      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
 2062|  1.29M|      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
 2063|  1.29M|      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
 2064|  1.29M|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 2065|  1.29M|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 2066|       |
 2067|  1.29M|      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
 2068|  1.29M|                            left[base_y_c[2]], left[base_y_c[3]]);
 2069|  1.29M|      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
 2070|  1.29M|                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
 2071|       |
 2072|  1.29M|      if (upsample_left) {
  ------------------
  |  Branch (2072:11): [True: 229k, False: 1.06M]
  ------------------
 2073|   229k|        shifty = _mm_srli_epi32(
 2074|   229k|            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
 2075|  1.06M|      } else {
 2076|  1.06M|        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
 2077|  1.06M|      }
 2078|  1.29M|      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
 2079|  1.29M|      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
 2080|  1.29M|      shift = _mm256_inserti128_si256(shift, shifty, 1);
 2081|  1.29M|    }
 2082|       |
 2083|  1.44M|    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
 2084|  1.44M|    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
 2085|  1.44M|    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2086|       |
 2087|  1.44M|    b = _mm256_mullo_epi32(diff, shift);
 2088|  1.44M|    res = _mm256_add_epi32(a32, b);
 2089|  1.44M|    res = _mm256_srli_epi32(res, 5);
 2090|       |
 2091|  1.44M|    resx = _mm256_castsi256_si128(res);
 2092|  1.44M|    resx = _mm_packus_epi32(resx, resx);
 2093|       |
 2094|  1.44M|    resy = _mm256_extracti128_si256(res, 1);
 2095|  1.44M|    resy = _mm_packus_epi32(resy, resy);
 2096|       |
 2097|  1.44M|    resxy =
 2098|  1.44M|        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
 2099|  1.44M|    _mm_storel_epi64((__m128i *)(dst), resxy);
 2100|  1.44M|    dst += stride;
 2101|  1.44M|  }
 2102|   290k|}
intrapred_avx2.c:highbd_dr_prediction_z2_Nx8_avx2:
 2381|   252k|    int dy) {
 2382|   252k|  const int min_base_x = -(1 << upsample_above);
 2383|   252k|  const int min_base_y = -(1 << upsample_left);
 2384|   252k|  const int frac_bits_x = 6 - upsample_above;
 2385|   252k|  const int frac_bits_y = 6 - upsample_left;
 2386|       |
 2387|       |  // pre-filter above pixels
 2388|       |  // store in temp buffers:
 2389|       |  //   above[x] * 32 + 16
 2390|       |  //   above[x+1] - above[x]
 2391|       |  // final pixels will be calculated as:
 2392|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2393|   252k|  __m128i c3f, min_base_y128;
 2394|   252k|  __m256i a0_x, a1_x, diff, a32, a16;
 2395|   252k|  __m128i a0_x128, a1_x128;
 2396|       |
 2397|   252k|  a16 = _mm256_set1_epi16(16);
 2398|   252k|  c3f = _mm_set1_epi16(0x3f);
 2399|   252k|  min_base_y128 = _mm_set1_epi16(min_base_y);
 2400|       |
 2401|  2.68M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (2401:19): [True: 2.43M, False: 252k]
  ------------------
 2402|  2.43M|    __m256i b, res, shift;
 2403|  2.43M|    __m128i resx, resy, resxy;
 2404|  2.43M|    int y = r + 1;
 2405|  2.43M|    int base_x = (-y * dx) >> frac_bits_x;
 2406|  2.43M|    int base_shift = 0;
 2407|  2.43M|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (2407:9): [True: 1.78M, False: 650k]
  ------------------
 2408|  1.78M|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 2409|  1.78M|    }
 2410|  2.43M|    int base_min_diff =
 2411|  2.43M|        (min_base_x - base_x + upsample_above) >> upsample_above;
 2412|  2.43M|    if (base_min_diff > 8) {
  ------------------
  |  Branch (2412:9): [True: 996k, False: 1.43M]
  ------------------
 2413|   996k|      base_min_diff = 8;
 2414|  1.43M|    } else {
 2415|  1.43M|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2415:11): [True: 0, False: 1.43M]
  ------------------
 2416|  1.43M|    }
 2417|       |
 2418|  2.43M|    if (base_shift > 7) {
  ------------------
  |  Branch (2418:9): [True: 996k, False: 1.43M]
  ------------------
 2419|   996k|      a0_x = _mm256_setzero_si256();
 2420|   996k|      a1_x = _mm256_setzero_si256();
 2421|   996k|      shift = _mm256_setzero_si256();
 2422|  1.43M|    } else {
 2423|  1.43M|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2424|  1.43M|      if (upsample_above) {
  ------------------
  |  Branch (2424:11): [True: 448k, False: 987k]
  ------------------
 2425|   448k|        __m128i mask, atmp0, atmp1, atmp2, atmp3;
 2426|   448k|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
 2427|   448k|        atmp0 = _mm_shuffle_epi8(a0_x128,
 2428|   448k|                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
 2429|   448k|        atmp1 = _mm_shuffle_epi8(a1_x128,
 2430|   448k|                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
 2431|   448k|        atmp2 = _mm_shuffle_epi8(
 2432|   448k|            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
 2433|   448k|        atmp3 = _mm_shuffle_epi8(
 2434|   448k|            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
 2435|   448k|        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
 2436|   448k|                              _mm_set1_epi8(15));
 2437|   448k|        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
 2438|   448k|        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
 2439|   448k|                              _mm_set1_epi8(15));
 2440|   448k|        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
 2441|       |
 2442|   448k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 2443|   448k|            _mm_and_si128(
 2444|   448k|                _mm_slli_epi16(
 2445|   448k|                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
 2446|   448k|                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
 2447|   448k|                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
 2448|   448k|                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
 2449|   448k|                    upsample_above),
 2450|   448k|                c3f),
 2451|   448k|            1));
 2452|   987k|      } else {
 2453|   987k|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
 2454|   987k|        a0_x128 =
 2455|   987k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2456|   987k|        a1_x128 =
 2457|   987k|            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2458|       |
 2459|   987k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 2460|   987k|            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
 2461|   987k|                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
 2462|   987k|                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
 2463|   987k|                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
 2464|   987k|                          c3f),
 2465|   987k|            1));
 2466|   987k|      }
 2467|  1.43M|      a0_x = _mm256_castsi128_si256(a0_x128);
 2468|  1.43M|      a1_x = _mm256_castsi128_si256(a1_x128);
 2469|  1.43M|    }
 2470|       |
 2471|       |    // y calc
 2472|  2.43M|    __m128i a0_y, a1_y, shifty;
 2473|  2.43M|    if (base_x < min_base_x) {
  ------------------
  |  Branch (2473:9): [True: 2.00M, False: 428k]
  ------------------
 2474|  2.00M|      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
  ------------------
  |  |   19|  2.00M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2475|  2.00M|      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
 2476|  2.00M|      r6 = _mm_set1_epi16(r << 6);
 2477|  2.00M|      dy128 = _mm_set1_epi16(dy);
 2478|  2.00M|      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 2479|  2.00M|      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
 2480|  2.00M|      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
 2481|  2.00M|      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
 2482|  2.00M|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 2483|  2.00M|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 2484|       |
 2485|  2.00M|      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 2486|  2.00M|                            left[base_y_c[2]], left[base_y_c[3]],
 2487|  2.00M|                            left[base_y_c[4]], left[base_y_c[5]],
 2488|  2.00M|                            left[base_y_c[6]], left[base_y_c[7]]);
 2489|  2.00M|      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
 2490|  2.00M|                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
 2491|  2.00M|                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
 2492|  2.00M|                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
 2493|       |
 2494|  2.00M|      if (upsample_left) {
  ------------------
  |  Branch (2494:11): [True: 548k, False: 1.45M]
  ------------------
 2495|   548k|        shifty = _mm_srli_epi16(
 2496|   548k|            _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
 2497|  1.45M|      } else {
 2498|  1.45M|        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
 2499|  1.45M|      }
 2500|  2.00M|      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
 2501|  2.00M|      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
 2502|  2.00M|      shift = _mm256_inserti128_si256(shift, shifty, 1);
 2503|  2.00M|    }
 2504|       |
 2505|  2.43M|    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 2506|  2.43M|    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 2507|  2.43M|    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 2508|       |
 2509|  2.43M|    b = _mm256_mullo_epi16(diff, shift);
 2510|  2.43M|    res = _mm256_add_epi16(a32, b);
 2511|  2.43M|    res = _mm256_srli_epi16(res, 5);
 2512|       |
 2513|  2.43M|    resx = _mm256_castsi256_si128(res);
 2514|  2.43M|    resy = _mm256_extracti128_si256(res, 1);
 2515|       |
 2516|  2.43M|    resxy =
 2517|  2.43M|        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
 2518|  2.43M|    _mm_storeu_si128((__m128i *)(dst), resxy);
 2519|  2.43M|    dst += stride;
 2520|  2.43M|  }
 2521|   252k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z2_Nx8_avx2:
 2231|   282k|    int dy) {
 2232|   282k|  const int min_base_x = -(1 << upsample_above);
 2233|   282k|  const int min_base_y = -(1 << upsample_left);
 2234|   282k|  const int frac_bits_x = 6 - upsample_above;
 2235|   282k|  const int frac_bits_y = 6 - upsample_left;
 2236|       |
 2237|       |  // pre-filter above pixels
 2238|       |  // store in temp buffers:
 2239|       |  //   above[x] * 32 + 16
 2240|       |  //   above[x+1] - above[x]
 2241|       |  // final pixels will be calculated as:
 2242|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2243|   282k|  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
 2244|   282k|  __m256i diff;
 2245|   282k|  __m128i a0_x128, a1_x128;
 2246|       |
 2247|   282k|  a16 = _mm256_set1_epi32(16);
 2248|   282k|  c3f = _mm256_set1_epi32(0x3f);
 2249|   282k|  min_base_y256 = _mm256_set1_epi32(min_base_y);
 2250|       |
 2251|  2.99M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (2251:19): [True: 2.71M, False: 282k]
  ------------------
 2252|  2.71M|    __m256i b, res, shift;
 2253|  2.71M|    __m128i resx, resy, resxy;
 2254|  2.71M|    int y = r + 1;
 2255|  2.71M|    int base_x = (-y * dx) >> frac_bits_x;
 2256|  2.71M|    int base_shift = 0;
 2257|  2.71M|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (2257:9): [True: 2.13M, False: 575k]
  ------------------
 2258|  2.13M|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 2259|  2.13M|    }
 2260|  2.71M|    int base_min_diff =
 2261|  2.71M|        (min_base_x - base_x + upsample_above) >> upsample_above;
 2262|  2.71M|    if (base_min_diff > 8) {
  ------------------
  |  Branch (2262:9): [True: 1.37M, False: 1.33M]
  ------------------
 2263|  1.37M|      base_min_diff = 8;
 2264|  1.37M|    } else {
 2265|  1.33M|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2265:11): [True: 0, False: 1.33M]
  ------------------
 2266|  1.33M|    }
 2267|       |
 2268|  2.71M|    if (base_shift > 7) {
  ------------------
  |  Branch (2268:9): [True: 1.37M, False: 1.33M]
  ------------------
 2269|  1.37M|      resx = _mm_setzero_si128();
 2270|  1.37M|    } else {
 2271|  1.33M|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2272|  1.33M|      if (upsample_above) {
  ------------------
  |  Branch (2272:11): [True: 69.4k, False: 1.26M]
  ------------------
 2273|  69.4k|        __m128i mask, atmp0, atmp1, atmp2, atmp3;
 2274|  69.4k|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
 2275|  69.4k|        atmp0 = _mm_shuffle_epi8(a0_x128,
 2276|  69.4k|                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
 2277|  69.4k|        atmp1 = _mm_shuffle_epi8(a1_x128,
 2278|  69.4k|                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
 2279|  69.4k|        atmp2 = _mm_shuffle_epi8(
 2280|  69.4k|            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
 2281|  69.4k|        atmp3 = _mm_shuffle_epi8(
 2282|  69.4k|            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
 2283|  69.4k|        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
 2284|  69.4k|                              _mm_set1_epi8(15));
 2285|  69.4k|        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
 2286|  69.4k|        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
 2287|  69.4k|                              _mm_set1_epi8(15));
 2288|  69.4k|        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
 2289|  69.4k|        shift = _mm256_srli_epi32(
 2290|  69.4k|            _mm256_and_si256(
 2291|  69.4k|                _mm256_slli_epi32(
 2292|  69.4k|                    _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
 2293|  69.4k|                                      (2 << 6) - y * dx, (3 << 6) - y * dx,
 2294|  69.4k|                                      (4 << 6) - y * dx, (5 << 6) - y * dx,
 2295|  69.4k|                                      (6 << 6) - y * dx, (7 << 6) - y * dx),
 2296|  69.4k|                    upsample_above),
 2297|  69.4k|                c3f),
 2298|  69.4k|            1);
 2299|  1.26M|      } else {
 2300|  1.26M|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
 2301|  1.26M|        a0_x128 =
 2302|  1.26M|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2303|  1.26M|        a1_x128 =
 2304|  1.26M|            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2305|       |
 2306|  1.26M|        shift = _mm256_srli_epi32(
 2307|  1.26M|            _mm256_and_si256(
 2308|  1.26M|                _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
 2309|  1.26M|                                  (3 << 6) - y * dx, (4 << 6) - y * dx,
 2310|  1.26M|                                  (5 << 6) - y * dx, (6 << 6) - y * dx,
 2311|  1.26M|                                  (7 << 6) - y * dx),
 2312|  1.26M|                c3f),
 2313|  1.26M|            1);
 2314|  1.26M|      }
 2315|  1.33M|      a0_x = _mm256_cvtepu16_epi32(a0_x128);
 2316|  1.33M|      a1_x = _mm256_cvtepu16_epi32(a1_x128);
 2317|       |
 2318|  1.33M|      diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
 2319|  1.33M|      a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
 2320|  1.33M|      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2321|       |
 2322|  1.33M|      b = _mm256_mullo_epi32(diff, shift);
 2323|  1.33M|      res = _mm256_add_epi32(a32, b);
 2324|  1.33M|      res = _mm256_srli_epi32(res, 5);
 2325|       |
 2326|  1.33M|      resx = _mm256_castsi256_si128(_mm256_packus_epi32(
 2327|  1.33M|          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
 2328|  1.33M|    }
 2329|       |    // y calc
 2330|  2.71M|    if (base_x < min_base_x) {
  ------------------
  |  Branch (2330:9): [True: 2.34M, False: 371k]
  ------------------
 2331|  2.34M|      DECLARE_ALIGNED(32, int, base_y_c[8]);
  ------------------
  |  |   19|  2.34M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2332|  2.34M|      __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
 2333|  2.34M|      r6 = _mm256_set1_epi32(r << 6);
 2334|  2.34M|      dy256 = _mm256_set1_epi32(dy);
 2335|  2.34M|      c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
 2336|  2.34M|      y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
 2337|  2.34M|      base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
 2338|  2.34M|      mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
 2339|  2.34M|      base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 2340|  2.34M|      _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 2341|       |
 2342|  2.34M|      a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2343|  2.34M|          left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 2344|  2.34M|          left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 2345|  2.34M|          left[base_y_c[6]], left[base_y_c[7]]));
 2346|  2.34M|      a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2347|  2.34M|          left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
 2348|  2.34M|          left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
 2349|  2.34M|          left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
 2350|       |
 2351|  2.34M|      if (upsample_left) {
  ------------------
  |  Branch (2351:11): [True: 120k, False: 2.21M]
  ------------------
 2352|   120k|        shift = _mm256_srli_epi32(
 2353|   120k|            _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
 2354|   120k|            1);
 2355|  2.21M|      } else {
 2356|  2.21M|        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
 2357|  2.21M|      }
 2358|  2.34M|      diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
 2359|  2.34M|      a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
 2360|  2.34M|      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2361|       |
 2362|  2.34M|      b = _mm256_mullo_epi32(diff, shift);
 2363|  2.34M|      res = _mm256_add_epi32(a32, b);
 2364|  2.34M|      res = _mm256_srli_epi32(res, 5);
 2365|       |
 2366|  2.34M|      resy = _mm256_castsi256_si128(_mm256_packus_epi32(
 2367|  2.34M|          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
 2368|  2.34M|    } else {
 2369|   371k|      resy = resx;
 2370|   371k|    }
 2371|  2.71M|    resxy =
 2372|  2.71M|        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
 2373|  2.71M|    _mm_storeu_si128((__m128i *)(dst), resxy);
 2374|  2.71M|    dst += stride;
 2375|  2.71M|  }
 2376|   282k|}
intrapred_avx2.c:highbd_dr_prediction_z2_HxW_avx2:
 2722|   420k|    int dy) {
 2723|       |  // here upsample_above and upsample_left are 0 by design of
 2724|       |  // av1_use_intra_edge_upsample
 2725|   420k|  const int min_base_x = -1;
 2726|   420k|  const int min_base_y = -1;
 2727|   420k|  (void)upsample_above;
 2728|   420k|  (void)upsample_left;
 2729|   420k|  const int frac_bits_x = 6;
 2730|   420k|  const int frac_bits_y = 6;
 2731|       |
 2732|       |  // pre-filter above pixels
 2733|       |  // store in temp buffers:
 2734|       |  //   above[x] * 32 + 16
 2735|       |  //   above[x+1] - above[x]
 2736|       |  // final pixels will be calculated as:
 2737|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2738|   420k|  __m256i a0_x, a1_x, a32, a16, c3f, c1;
 2739|   420k|  __m256i diff, min_base_y256, dy256, c1234, c0123;
 2740|   420k|  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  ------------------
  |  |   19|   420k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2741|       |
 2742|   420k|  a16 = _mm256_set1_epi16(16);
 2743|   420k|  c1 = _mm256_srli_epi16(a16, 4);
 2744|   420k|  min_base_y256 = _mm256_set1_epi16(min_base_y);
 2745|   420k|  c3f = _mm256_set1_epi16(0x3f);
 2746|   420k|  dy256 = _mm256_set1_epi16(dy);
 2747|   420k|  c0123 =
 2748|   420k|      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 2749|   420k|  c1234 = _mm256_add_epi16(c0123, c1);
 2750|       |
 2751|  8.12M|  for (int r = 0; r < H; r++) {
  ------------------
  |  Branch (2751:19): [True: 7.70M, False: 420k]
  ------------------
 2752|  7.70M|    __m256i b, res, shift;
 2753|  7.70M|    __m256i resx, resy, ydx;
 2754|  7.70M|    __m256i resxy, j256, r6;
 2755|  7.70M|    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
 2756|  7.70M|    int y = r + 1;
 2757|  7.70M|    ydx = _mm256_set1_epi16((short)(y * dx));
 2758|       |
 2759|  21.0M|    for (int j = 0; j < W; j += 16) {
  ------------------
  |  Branch (2759:21): [True: 13.3M, False: 7.70M]
  ------------------
 2760|  13.3M|      j256 = _mm256_set1_epi16(j);
 2761|  13.3M|      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
 2762|  13.3M|      int base_shift = 0;
 2763|  13.3M|      if ((base_x) < (min_base_x - 1)) {
  ------------------
  |  Branch (2763:11): [True: 9.93M, False: 3.45M]
  ------------------
 2764|  9.93M|        base_shift = (min_base_x - (base_x)-1);
 2765|  9.93M|      }
 2766|  13.3M|      int base_min_diff = (min_base_x - base_x);
 2767|  13.3M|      if (base_min_diff > 16) {
  ------------------
  |  Branch (2767:11): [True: 7.23M, False: 6.15M]
  ------------------
 2768|  7.23M|        base_min_diff = 16;
 2769|  7.23M|      } else {
 2770|  6.15M|        if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2770:13): [True: 2.20M, False: 3.95M]
  ------------------
 2771|  6.15M|      }
 2772|       |
 2773|  13.3M|      if (base_shift < 8) {
  ------------------
  |  Branch (2773:11): [True: 5.12M, False: 8.26M]
  ------------------
 2774|  5.12M|        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2775|  5.12M|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
 2776|  5.12M|        a0_x128 =
 2777|  5.12M|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2778|  5.12M|        a1_x128 =
 2779|  5.12M|            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2780|       |
 2781|  5.12M|        a0_x = _mm256_castsi128_si256(a0_x128);
 2782|  5.12M|        a1_x = _mm256_castsi128_si256(a1_x128);
 2783|  8.26M|      } else {
 2784|  8.26M|        a0_x = _mm256_setzero_si256();
 2785|  8.26M|        a1_x = _mm256_setzero_si256();
 2786|  8.26M|      }
 2787|       |
 2788|  13.3M|      int base_shift1 = 0;
 2789|  13.3M|      if (base_shift > 8) {
  ------------------
  |  Branch (2789:11): [True: 8.10M, False: 5.28M]
  ------------------
 2790|  8.10M|        base_shift1 = base_shift - 8;
 2791|  8.10M|      }
 2792|  13.3M|      if (base_shift1 < 8) {
  ------------------
  |  Branch (2792:11): [True: 6.15M, False: 7.23M]
  ------------------
 2793|  6.15M|        a0_1_x128 =
 2794|  6.15M|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
 2795|  6.15M|        a1_1_x128 =
 2796|  6.15M|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
 2797|  6.15M|        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
 2798|  6.15M|                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
 2799|  6.15M|        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
 2800|  6.15M|                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
 2801|       |
 2802|  6.15M|        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
 2803|  6.15M|        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
 2804|  6.15M|      }
 2805|  13.3M|      r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
 2806|  13.3M|      shift = _mm256_srli_epi16(
 2807|  13.3M|          _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
 2808|       |
 2809|  13.3M|      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 2810|  13.3M|      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 2811|  13.3M|      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 2812|       |
 2813|  13.3M|      b = _mm256_mullo_epi16(diff, shift);
 2814|  13.3M|      res = _mm256_add_epi16(a32, b);
 2815|  13.3M|      resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
 2816|       |
 2817|       |      // y calc
 2818|  13.3M|      resy = _mm256_setzero_si256();
 2819|  13.3M|      __m256i a0_y, a1_y, shifty;
 2820|  13.3M|      if ((base_x < min_base_x)) {
  ------------------
  |  Branch (2820:11): [True: 10.4M, False: 2.92M]
  ------------------
 2821|  10.4M|        __m256i c256, y_c256, base_y_c256, mask256, mul16;
 2822|  10.4M|        r6 = _mm256_set1_epi16(r << 6);
 2823|  10.4M|        c256 = _mm256_add_epi16(j256, c1234);
 2824|  10.4M|        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
 2825|  10.4M|                                 _mm256_srli_epi16(min_base_y256, 1));
 2826|  10.4M|        y_c256 = _mm256_sub_epi16(r6, mul16);
 2827|  10.4M|        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
 2828|  10.4M|        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
 2829|  10.4M|        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 2830|  10.4M|        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 2831|       |
 2832|  10.4M|        a0_y = _mm256_setr_epi16(
 2833|  10.4M|            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 2834|  10.4M|            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 2835|  10.4M|            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
 2836|  10.4M|            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
 2837|  10.4M|            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
 2838|  10.4M|            left[base_y_c[15]]);
 2839|  10.4M|        base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
 2840|  10.4M|        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 2841|       |
 2842|  10.4M|        a1_y = _mm256_setr_epi16(
 2843|  10.4M|            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 2844|  10.4M|            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 2845|  10.4M|            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
 2846|  10.4M|            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
 2847|  10.4M|            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
 2848|  10.4M|            left[base_y_c[15]]);
 2849|       |
 2850|  10.4M|        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
 2851|       |
 2852|  10.4M|        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
 2853|  10.4M|        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
 2854|  10.4M|        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 2855|       |
 2856|  10.4M|        b = _mm256_mullo_epi16(diff, shifty);
 2857|  10.4M|        res = _mm256_add_epi16(a32, b);
 2858|  10.4M|        resy = _mm256_srli_epi16(res, 5);
 2859|  10.4M|      }
 2860|       |
 2861|  13.3M|      resxy = _mm256_blendv_epi8(resx, resy,
 2862|  13.3M|                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
 2863|  13.3M|      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
 2864|  13.3M|    }  // for j
 2865|  7.70M|    dst += stride;
 2866|  7.70M|  }
 2867|   420k|}
intrapred_avx2.c:highbd_dr_prediction_32bit_z2_HxW_avx2:
 2526|   177k|    int dy) {
 2527|       |  // here upsample_above and upsample_left are 0 by design of
 2528|       |  // av1_use_intra_edge_upsample
 2529|   177k|  const int min_base_x = -1;
 2530|   177k|  const int min_base_y = -1;
 2531|   177k|  (void)upsample_above;
 2532|   177k|  (void)upsample_left;
 2533|   177k|  const int frac_bits_x = 6;
 2534|   177k|  const int frac_bits_y = 6;
 2535|       |
 2536|       |  // pre-filter above pixels
 2537|       |  // store in temp buffers:
 2538|       |  //   above[x] * 32 + 16
 2539|       |  //   above[x+1] - above[x]
 2540|       |  // final pixels will be calculated as:
 2541|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 2542|   177k|  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
 2543|   177k|  __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
 2544|   177k|  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
 2545|   177k|  DECLARE_ALIGNED(32, int, base_y_c[16]);
  ------------------
  |  |   19|   177k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 2546|       |
 2547|   177k|  a16 = _mm256_set1_epi32(16);
 2548|   177k|  c1 = _mm256_srli_epi32(a16, 4);
 2549|   177k|  c8 = _mm256_srli_epi32(a16, 1);
 2550|   177k|  min_base_y256 = _mm256_set1_epi32(min_base_y);
 2551|   177k|  c3f = _mm256_set1_epi32(0x3f);
 2552|   177k|  dy256 = _mm256_set1_epi32(dy);
 2553|   177k|  c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
 2554|   177k|  c1234 = _mm256_add_epi32(c0123, c1);
 2555|       |
 2556|  2.68M|  for (int r = 0; r < H; r++) {
  ------------------
  |  Branch (2556:19): [True: 2.50M, False: 177k]
  ------------------
 2557|  2.50M|    __m256i b, res, shift, ydx;
 2558|  2.50M|    __m256i resx[2], resy[2];
 2559|  2.50M|    __m256i resxy, j256, r6;
 2560|  7.67M|    for (int j = 0; j < W; j += 16) {
  ------------------
  |  Branch (2560:21): [True: 5.16M, False: 2.50M]
  ------------------
 2561|  5.16M|      j256 = _mm256_set1_epi32(j);
 2562|  5.16M|      int y = r + 1;
 2563|  5.16M|      ydx = _mm256_set1_epi32(y * dx);
 2564|       |
 2565|  5.16M|      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
 2566|  5.16M|      int base_shift = 0;
 2567|  5.16M|      if ((base_x) < (min_base_x - 1)) {
  ------------------
  |  Branch (2567:11): [True: 3.24M, False: 1.91M]
  ------------------
 2568|  3.24M|        base_shift = (min_base_x - base_x - 1);
 2569|  3.24M|      }
 2570|  5.16M|      int base_min_diff = (min_base_x - base_x);
 2571|  5.16M|      if (base_min_diff > 16) {
  ------------------
  |  Branch (2571:11): [True: 2.10M, False: 3.05M]
  ------------------
 2572|  2.10M|        base_min_diff = 16;
 2573|  3.05M|      } else {
 2574|  3.05M|        if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (2574:13): [True: 1.29M, False: 1.75M]
  ------------------
 2575|  3.05M|      }
 2576|       |
 2577|  5.16M|      if (base_shift > 7) {
  ------------------
  |  Branch (2577:11): [True: 2.55M, False: 2.60M]
  ------------------
 2578|  2.55M|        resx[0] = _mm256_setzero_si256();
 2579|  2.60M|      } else {
 2580|  2.60M|        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 2581|  2.60M|        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
 2582|  2.60M|        a0_x128 =
 2583|  2.60M|            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2584|  2.60M|        a1_x128 =
 2585|  2.60M|            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
 2586|       |
 2587|  2.60M|        a0_x = _mm256_cvtepu16_epi32(a0_x128);
 2588|  2.60M|        a1_x = _mm256_cvtepu16_epi32(a1_x128);
 2589|       |
 2590|  2.60M|        r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
 2591|  2.60M|        shift = _mm256_srli_epi32(
 2592|  2.60M|            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
 2593|       |
 2594|  2.60M|        diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
 2595|  2.60M|        a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
 2596|  2.60M|        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2597|       |
 2598|  2.60M|        b = _mm256_mullo_epi32(diff, shift);
 2599|  2.60M|        res = _mm256_add_epi32(a32, b);
 2600|  2.60M|        res = _mm256_srli_epi32(res, 5);
 2601|       |
 2602|  2.60M|        resx[0] = _mm256_packus_epi32(
 2603|  2.60M|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
 2604|  2.60M|      }
 2605|  5.16M|      int base_shift8 = 0;
 2606|  5.16M|      if ((base_x + 8) < (min_base_x - 1)) {
  ------------------
  |  Branch (2606:11): [True: 2.50M, False: 2.66M]
  ------------------
 2607|  2.50M|        base_shift8 = (min_base_x - (base_x + 8) - 1);
 2608|  2.50M|      }
 2609|  5.16M|      if (base_shift8 > 7) {
  ------------------
  |  Branch (2609:11): [True: 2.10M, False: 3.05M]
  ------------------
 2610|  2.10M|        resx[1] = _mm256_setzero_si256();
 2611|  3.05M|      } else {
 2612|  3.05M|        a0_1_x128 =
 2613|  3.05M|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
 2614|  3.05M|        a1_1_x128 =
 2615|  3.05M|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
 2616|  3.05M|        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
 2617|  3.05M|                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
 2618|  3.05M|        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
 2619|  3.05M|                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
 2620|       |
 2621|  3.05M|        a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
 2622|  3.05M|        a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
 2623|       |
 2624|  3.05M|        r6 = _mm256_slli_epi32(
 2625|  3.05M|            _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
 2626|  3.05M|        shift = _mm256_srli_epi32(
 2627|  3.05M|            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
 2628|       |
 2629|  3.05M|        diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
 2630|  3.05M|        a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
 2631|  3.05M|        a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
 2632|  3.05M|        b = _mm256_mullo_epi32(diff, shift);
 2633|       |
 2634|  3.05M|        resx[1] = _mm256_add_epi32(a32, b);
 2635|  3.05M|        resx[1] = _mm256_srli_epi32(resx[1], 5);
 2636|  3.05M|        resx[1] = _mm256_packus_epi32(
 2637|  3.05M|            resx[1],
 2638|  3.05M|            _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
 2639|  3.05M|      }
 2640|  5.16M|      resx[0] =
 2641|  5.16M|          _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
 2642|  5.16M|                                  1);  // 16 16bit values
 2643|       |
 2644|       |      // y calc
 2645|  5.16M|      resy[0] = _mm256_setzero_si256();
 2646|  5.16M|      if ((base_x < min_base_x)) {
  ------------------
  |  Branch (2646:11): [True: 3.42M, False: 1.74M]
  ------------------
 2647|  3.42M|        __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
 2648|  3.42M|        r6 = _mm256_set1_epi32(r << 6);
 2649|  3.42M|        c256 = _mm256_add_epi32(j256, c1234);
 2650|  3.42M|        y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
 2651|  3.42M|        base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
 2652|  3.42M|        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
 2653|  3.42M|        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 2654|  3.42M|        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 2655|  3.42M|        c256 = _mm256_add_epi32(c256, c8);
 2656|  3.42M|        y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
 2657|  3.42M|        base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
 2658|  3.42M|        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
 2659|  3.42M|        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 2660|  3.42M|        _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
 2661|       |
 2662|  3.42M|        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2663|  3.42M|            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 2664|  3.42M|            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 2665|  3.42M|            left[base_y_c[6]], left[base_y_c[7]]));
 2666|  3.42M|        a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2667|  3.42M|            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
 2668|  3.42M|            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
 2669|  3.42M|            left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
 2670|       |
 2671|  3.42M|        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
 2672|       |
 2673|  3.42M|        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
 2674|  3.42M|        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
 2675|  3.42M|        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2676|       |
 2677|  3.42M|        b = _mm256_mullo_epi32(diff, shift);
 2678|  3.42M|        res = _mm256_add_epi32(a32, b);
 2679|  3.42M|        res = _mm256_srli_epi32(res, 5);
 2680|       |
 2681|  3.42M|        resy[0] = _mm256_packus_epi32(
 2682|  3.42M|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
 2683|       |
 2684|  3.42M|        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
 2685|  3.42M|            left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
 2686|  3.42M|            left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
 2687|  3.42M|            left[base_y_c[14]], left[base_y_c[15]]));
 2688|  3.42M|        a1_y = _mm256_cvtepu16_epi32(
 2689|  3.42M|            _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
 2690|  3.42M|                           left[base_y_c[10] + 1], left[base_y_c[11] + 1],
 2691|  3.42M|                           left[base_y_c[12] + 1], left[base_y_c[13] + 1],
 2692|  3.42M|                           left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
 2693|  3.42M|        shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
 2694|       |
 2695|  3.42M|        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
 2696|  3.42M|        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
 2697|  3.42M|        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
 2698|       |
 2699|  3.42M|        b = _mm256_mullo_epi32(diff, shift);
 2700|  3.42M|        res = _mm256_add_epi32(a32, b);
 2701|  3.42M|        res = _mm256_srli_epi32(res, 5);
 2702|       |
 2703|  3.42M|        resy[1] = _mm256_packus_epi32(
 2704|  3.42M|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
 2705|       |
 2706|  3.42M|        resy[0] =
 2707|  3.42M|            _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
 2708|  3.42M|                                    1);  // 16 16bit values
 2709|  3.42M|      }
 2710|       |
 2711|  5.16M|      resxy = _mm256_blendv_epi8(resx[0], resy[0],
 2712|  5.16M|                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
 2713|  5.16M|      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
 2714|  5.16M|    }  // for j
 2715|  2.50M|    dst += stride;
 2716|  2.50M|  }
 2717|   177k|}
intrapred_avx2.c:highbd_dr_prediction_z3_4x4_avx2:
 2916|   225k|                                             int bd) {
 2917|   225k|  __m128i dstvec[4], d[4];
 2918|   225k|  if (bd < 12) {
  ------------------
  |  Branch (2918:7): [True: 189k, False: 36.4k]
  ------------------
 2919|   189k|    highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
 2920|   189k|                                              dy);
 2921|   189k|  } else {
 2922|  36.4k|    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
 2923|  36.4k|                                                    upsample_left, dy);
 2924|  36.4k|  }
 2925|   225k|  highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
 2926|   225k|                                   &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
 2927|   225k|  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
 2928|   225k|  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
 2929|   225k|  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
 2930|   225k|  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
 2931|   225k|  return;
 2932|   225k|}
intrapred_avx2.c:highbd_dr_prediction_z3_8x8_avx2:
 2937|   210k|                                             int bd) {
 2938|   210k|  __m128i dstvec[8], d[8];
 2939|   210k|  if (bd < 12) {
  ------------------
  |  Branch (2939:7): [True: 110k, False: 100k]
  ------------------
 2940|   110k|    highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
 2941|   110k|                                              dy);
 2942|   110k|  } else {
 2943|   100k|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
 2944|   100k|                                                    upsample_left, dy);
 2945|   100k|  }
 2946|   210k|  highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 2947|   210k|                           &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
 2948|   210k|                           &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
 2949|   210k|                           &d[7]);
 2950|  1.89M|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (2950:19): [True: 1.68M, False: 210k]
  ------------------
 2951|  1.68M|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 2952|  1.68M|  }
 2953|   210k|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x16_avx2:
 3164|   112k|                                               int bd) {
 3165|   112k|  __m256i dstvec[16], d[16];
 3166|   112k|  if (bd < 12) {
  ------------------
  |  Branch (3166:7): [True: 94.1k, False: 18.2k]
  ------------------
 3167|  94.1k|    highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
 3168|  94.1k|                                               dy);
 3169|  94.1k|  } else {
 3170|  18.2k|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
 3171|  18.2k|                                                     upsample_left, dy);
 3172|  18.2k|  }
 3173|       |
 3174|   112k|  highbd_transpose16x16_avx2(dstvec, d);
 3175|       |
 3176|  1.90M|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (3176:19): [True: 1.79M, False: 112k]
  ------------------
 3177|  1.79M|    _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
 3178|  1.79M|  }
 3179|   112k|}
intrapred_avx2.c:highbd_transpose16x16_avx2:
  243|  1.17M|static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
  244|  1.17M|  __m256i w0, w1, w2, w3, ww0, ww1;
  245|  1.17M|  __m256i dd[16];
  246|  1.17M|  w0 = _mm256_unpacklo_epi16(x[0], x[1]);
  247|  1.17M|  w1 = _mm256_unpacklo_epi16(x[2], x[3]);
  248|  1.17M|  w2 = _mm256_unpacklo_epi16(x[4], x[5]);
  249|  1.17M|  w3 = _mm256_unpacklo_epi16(x[6], x[7]);
  250|       |
  251|  1.17M|  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
  252|  1.17M|  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
  253|       |
  254|  1.17M|  dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
  255|  1.17M|  dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
  256|       |
  257|  1.17M|  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
  258|  1.17M|  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
  259|       |
  260|  1.17M|  dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
  261|  1.17M|  dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
  262|       |
  263|  1.17M|  w0 = _mm256_unpackhi_epi16(x[0], x[1]);
  264|  1.17M|  w1 = _mm256_unpackhi_epi16(x[2], x[3]);
  265|  1.17M|  w2 = _mm256_unpackhi_epi16(x[4], x[5]);
  266|  1.17M|  w3 = _mm256_unpackhi_epi16(x[6], x[7]);
  267|       |
  268|  1.17M|  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
  269|  1.17M|  ww1 = _mm256_unpacklo_epi32(w2, w3);  //
  270|       |
  271|  1.17M|  dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
  272|  1.17M|  dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
  273|       |
  274|  1.17M|  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
  275|  1.17M|  ww1 = _mm256_unpackhi_epi32(w2, w3);  //
  276|       |
  277|  1.17M|  dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
  278|  1.17M|  dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
  279|       |
  280|  1.17M|  w0 = _mm256_unpacklo_epi16(x[8], x[9]);
  281|  1.17M|  w1 = _mm256_unpacklo_epi16(x[10], x[11]);
  282|  1.17M|  w2 = _mm256_unpacklo_epi16(x[12], x[13]);
  283|  1.17M|  w3 = _mm256_unpacklo_epi16(x[14], x[15]);
  284|       |
  285|  1.17M|  ww0 = _mm256_unpacklo_epi32(w0, w1);
  286|  1.17M|  ww1 = _mm256_unpacklo_epi32(w2, w3);
  287|       |
  288|  1.17M|  dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
  289|  1.17M|  dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
  290|       |
  291|  1.17M|  ww0 = _mm256_unpackhi_epi32(w0, w1);
  292|  1.17M|  ww1 = _mm256_unpackhi_epi32(w2, w3);
  293|       |
  294|  1.17M|  dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
  295|  1.17M|  dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
  296|       |
  297|  1.17M|  w0 = _mm256_unpackhi_epi16(x[8], x[9]);
  298|  1.17M|  w1 = _mm256_unpackhi_epi16(x[10], x[11]);
  299|  1.17M|  w2 = _mm256_unpackhi_epi16(x[12], x[13]);
  300|  1.17M|  w3 = _mm256_unpackhi_epi16(x[14], x[15]);
  301|       |
  302|  1.17M|  ww0 = _mm256_unpacklo_epi32(w0, w1);
  303|  1.17M|  ww1 = _mm256_unpacklo_epi32(w2, w3);
  304|       |
  305|  1.17M|  dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
  306|  1.17M|  dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
  307|       |
  308|  1.17M|  ww0 = _mm256_unpackhi_epi32(w0, w1);
  309|  1.17M|  ww1 = _mm256_unpackhi_epi32(w2, w3);
  310|       |
  311|  1.17M|  dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
  312|  1.17M|  dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
  313|       |
  314|  10.5M|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (314:19): [True: 9.41M, False: 1.17M]
  ------------------
  315|  9.41M|    d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
  316|  9.41M|    d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
  317|  9.41M|                                       _mm256_extracti128_si256(dd[i], 1), 0);
  318|  9.41M|  }
  319|  1.17M|}
intrapred_avx2.c:highbd_dr_prediction_z3_32x32_avx2:
 3184|  94.8k|                                               int bd) {
 3185|  94.8k|  __m256i dstvec[64], d[16];
 3186|  94.8k|  if (bd < 12) {
  ------------------
  |  Branch (3186:7): [True: 89.8k, False: 5.02k]
  ------------------
 3187|  89.8k|    highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
 3188|  89.8k|                                               dy);
 3189|  89.8k|  } else {
 3190|  5.02k|    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
 3191|  5.02k|                                                     upsample_left, dy);
 3192|  5.02k|  }
 3193|  94.8k|  highbd_transpose16x16_avx2(dstvec, d);
 3194|  1.61M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3194:19): [True: 1.51M, False: 94.8k]
  ------------------
 3195|  1.51M|    _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
 3196|  1.51M|  }
 3197|  94.8k|  highbd_transpose16x16_avx2(dstvec + 16, d);
 3198|  1.61M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3198:19): [True: 1.51M, False: 94.8k]
  ------------------
 3199|  1.51M|    _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
 3200|  1.51M|  }
 3201|  94.8k|  highbd_transpose16x16_avx2(dstvec + 32, d);
 3202|  1.61M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3202:19): [True: 1.51M, False: 94.8k]
  ------------------
 3203|  1.51M|    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
 3204|  1.51M|  }
 3205|  94.8k|  highbd_transpose16x16_avx2(dstvec + 48, d);
 3206|  1.61M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3206:19): [True: 1.51M, False: 94.8k]
  ------------------
 3207|  1.51M|    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
 3208|  1.51M|  }
 3209|  94.8k|}
intrapred_avx2.c:highbd_dr_prediction_z3_64x64_avx2:
 3214|  30.9k|                                               int bd) {
 3215|  30.9k|  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
  ------------------
  |  |   19|  30.9k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 3216|  30.9k|  if (bd < 12) {
  ------------------
  |  Branch (3216:7): [True: 26.9k, False: 4.01k]
  ------------------
 3217|  26.9k|    highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
 3218|  26.9k|  } else {
 3219|  4.01k|    highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
 3220|  4.01k|                                            dy);
 3221|  4.01k|  }
 3222|  30.9k|  highbd_transpose(dstT, 64, dst, stride, 64, 64);
 3223|  30.9k|}
intrapred_avx2.c:highbd_transpose:
 1971|  40.9k|                             int height) {
 1972|   198k|  for (int j = 0; j < height; j += 16)
  ------------------
  |  Branch (1972:19): [True: 157k, False: 40.9k]
  ------------------
 1973|   713k|    for (int i = 0; i < width; i += 16)
  ------------------
  |  Branch (1973:21): [True: 555k, False: 157k]
  ------------------
 1974|   555k|      highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
 1975|   555k|                                dst + j * pitchDst + i, pitchDst);
 1976|  40.9k|}
intrapred_avx2.c:highbd_transpose_TX_16X16:
 1957|   555k|                                      uint16_t *dst, ptrdiff_t pitchDst) {
 1958|   555k|  __m256i r[16];
 1959|   555k|  __m256i d[16];
 1960|  9.43M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (1960:19): [True: 8.88M, False: 555k]
  ------------------
 1961|  8.88M|    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
 1962|  8.88M|  }
 1963|   555k|  highbd_transpose16x16_avx2(r, d);
 1964|  9.43M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (1964:19): [True: 8.88M, False: 555k]
  ------------------
 1965|  8.88M|    _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
 1966|  8.88M|  }
 1967|   555k|}
intrapred_avx2.c:highbd_dr_prediction_z3_4x8_avx2:
 2958|  35.7k|                                             int bd) {
 2959|  35.7k|  __m128i dstvec[4], d[8];
 2960|  35.7k|  if (bd < 12) {
  ------------------
  |  Branch (2960:7): [True: 19.7k, False: 15.9k]
  ------------------
 2961|  19.7k|    highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
 2962|  19.7k|                                              dy);
 2963|  19.7k|  } else {
 2964|  15.9k|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
 2965|  15.9k|                                                    upsample_left, dy);
 2966|  15.9k|  }
 2967|       |
 2968|  35.7k|  highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 2969|  35.7k|                               &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
 2970|  35.7k|                               &d[7]);
 2971|   321k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (2971:19): [True: 286k, False: 35.7k]
  ------------------
 2972|   286k|    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
 2973|   286k|  }
 2974|  35.7k|}
intrapred_avx2.c:highbd_dr_prediction_z3_8x16_avx2:
 3001|  42.5k|                                              int bd) {
 3002|  42.5k|  __m256i dstvec[8], d[8];
 3003|  42.5k|  if (bd < 12) {
  ------------------
  |  Branch (3003:7): [True: 28.4k, False: 14.1k]
  ------------------
 3004|  28.4k|    highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
 3005|  28.4k|                                               dy);
 3006|  28.4k|  } else {
 3007|  14.1k|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
 3008|  14.1k|                                                     upsample_left, dy);
 3009|  14.1k|  }
 3010|  42.5k|  highbd_transpose8x16_16x8_avx2(dstvec, d);
 3011|   382k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3011:19): [True: 340k, False: 42.5k]
  ------------------
 3012|   340k|    _mm_storeu_si128((__m128i *)(dst + i * stride),
 3013|   340k|                     _mm256_castsi256_si128(d[i]));
 3014|   340k|  }
 3015|   382k|  for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (3015:19): [True: 340k, False: 42.5k]
  ------------------
 3016|   340k|    _mm_storeu_si128((__m128i *)(dst + i * stride),
 3017|   340k|                     _mm256_extracti128_si256(d[i - 8], 1));
 3018|   340k|  }
 3019|  42.5k|}
intrapred_avx2.c:highbd_transpose8x16_16x8_avx2:
  205|   172k|static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
  206|   172k|  __m256i w0, w1, w2, w3, ww0, ww1;
  207|       |
  208|   172k|  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  209|   172k|  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  210|   172k|  w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
  211|   172k|  w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73
  212|       |
  213|   172k|  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  214|   172k|  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
  215|       |
  216|   172k|  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  217|   172k|  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
  218|       |
  219|   172k|  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  220|   172k|  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
  221|       |
  222|   172k|  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  223|   172k|  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
  224|       |
  225|   172k|  w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
  226|   172k|  w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
  227|   172k|  w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
  228|   172k|  w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77
  229|       |
  230|   172k|  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
  231|   172k|  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
  232|       |
  233|   172k|  d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
  234|   172k|  d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
  235|       |
  236|   172k|  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
  237|   172k|  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
  238|       |
  239|   172k|  d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
  240|   172k|  d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
  241|   172k|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x32_avx2:
 3228|  25.9k|                                               int bd) {
 3229|  25.9k|  __m256i dstvec[32], d[32];
 3230|  25.9k|  if (bd < 12) {
  ------------------
  |  Branch (3230:7): [True: 22.4k, False: 3.45k]
  ------------------
 3231|  22.4k|    highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
 3232|  22.4k|                                               dy);
 3233|  22.4k|  } else {
 3234|  3.45k|    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
 3235|  3.45k|                                                     upsample_left, dy);
 3236|  3.45k|  }
 3237|   129k|  for (int i = 0; i < 32; i += 8) {
  ------------------
  |  Branch (3237:19): [True: 103k, False: 25.9k]
  ------------------
 3238|   103k|    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
 3239|   103k|  }
 3240|       |  // store
 3241|  77.7k|  for (int j = 0; j < 32; j += 16) {
  ------------------
  |  Branch (3241:19): [True: 51.8k, False: 25.9k]
  ------------------
 3242|   466k|    for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3242:21): [True: 414k, False: 51.8k]
  ------------------
 3243|   414k|      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
 3244|   414k|                       _mm256_castsi256_si128(d[(i + j)]));
 3245|   414k|    }
 3246|   466k|    for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3246:21): [True: 414k, False: 51.8k]
  ------------------
 3247|   414k|      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
 3248|   414k|                       _mm256_castsi256_si128(d[(i + j) + 8]));
 3249|   414k|    }
 3250|   466k|    for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (3250:21): [True: 414k, False: 51.8k]
  ------------------
 3251|   414k|      _mm256_storeu_si256(
 3252|   414k|          (__m256i *)(dst + (i + j) * stride),
 3253|   414k|          _mm256_inserti128_si256(
 3254|   414k|              d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
 3255|   414k|    }
 3256|  51.8k|  }
 3257|  25.9k|}
intrapred_avx2.c:highbd_dr_prediction_z3_32x64_avx2:
 3282|  2.18k|                                               int bd) {
 3283|  2.18k|  uint16_t dstT[64 * 32];
 3284|  2.18k|  if (bd < 12) {
  ------------------
  |  Branch (3284:7): [True: 1.74k, False: 446]
  ------------------
 3285|  1.74k|    highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
 3286|  1.74k|  } else {
 3287|    446|    highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
 3288|    446|                                            dy);
 3289|    446|  }
 3290|  2.18k|  highbd_transpose(dstT, 64, dst, stride, 32, 64);
 3291|  2.18k|}
intrapred_avx2.c:highbd_dr_prediction_z3_4x16_avx2:
 3050|  24.1k|                                              int bd) {
 3051|  24.1k|  __m256i dstvec[4], d[4], d1;
 3052|  24.1k|  if (bd < 12) {
  ------------------
  |  Branch (3052:7): [True: 15.9k, False: 8.21k]
  ------------------
 3053|  15.9k|    highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
 3054|  15.9k|                                               dy);
 3055|  15.9k|  } else {
 3056|  8.21k|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
 3057|  8.21k|                                                     upsample_left, dy);
 3058|  8.21k|  }
 3059|  24.1k|  highbd_transpose4x16_avx2(dstvec, d);
 3060|   120k|  for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (3060:19): [True: 96.5k, False: 24.1k]
  ------------------
 3061|  96.5k|    _mm_storel_epi64((__m128i *)(dst + i * stride),
 3062|  96.5k|                     _mm256_castsi256_si128(d[i]));
 3063|  96.5k|    d1 = _mm256_bsrli_epi128(d[i], 8);
 3064|  96.5k|    _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
 3065|  96.5k|                     _mm256_castsi256_si128(d1));
 3066|  96.5k|    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
 3067|  96.5k|                     _mm256_extracti128_si256(d[i], 1));
 3068|  96.5k|    _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
 3069|  96.5k|                     _mm256_extracti128_si256(d1, 1));
 3070|  96.5k|  }
 3071|  24.1k|}
intrapred_avx2.c:highbd_transpose4x16_avx2:
  183|  24.1k|static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
  184|  24.1k|  __m256i w0, w1, w2, w3, ww0, ww1;
  185|       |
  186|  24.1k|  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  187|  24.1k|  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  188|  24.1k|  w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
  189|  24.1k|  w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73
  190|       |
  191|  24.1k|  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  192|  24.1k|  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
  193|       |
  194|  24.1k|  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  195|  24.1k|  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
  196|       |
  197|  24.1k|  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  198|  24.1k|  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
  199|       |
  200|  24.1k|  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  201|  24.1k|  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
  202|  24.1k|}
intrapred_avx2.c:highbd_dr_prediction_z3_8x32_avx2:
 3100|  13.1k|                                              int bd) {
 3101|  13.1k|  __m256i dstvec[16], d[16];
 3102|  13.1k|  if (bd < 12) {
  ------------------
  |  Branch (3102:7): [True: 10.9k, False: 2.21k]
  ------------------
 3103|  10.9k|    highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
 3104|  10.9k|                                               dy);
 3105|  10.9k|  } else {
 3106|  2.21k|    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
 3107|  2.21k|                                                     upsample_left, dy);
 3108|  2.21k|  }
 3109|       |
 3110|  39.4k|  for (int i = 0; i < 16; i += 8) {
  ------------------
  |  Branch (3110:19): [True: 26.2k, False: 13.1k]
  ------------------
 3111|  26.2k|    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
 3112|  26.2k|  }
 3113|       |
 3114|   118k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3114:19): [True: 105k, False: 13.1k]
  ------------------
 3115|   105k|    _mm_storeu_si128((__m128i *)(dst + i * stride),
 3116|   105k|                     _mm256_castsi256_si128(d[i]));
 3117|   105k|  }
 3118|   118k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3118:19): [True: 105k, False: 13.1k]
  ------------------
 3119|   105k|    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
 3120|   105k|                     _mm256_extracti128_si256(d[i], 1));
 3121|   105k|  }
 3122|   118k|  for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (3122:19): [True: 105k, False: 13.1k]
  ------------------
 3123|   105k|    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
 3124|   105k|                     _mm256_castsi256_si128(d[i]));
 3125|   105k|  }
 3126|   118k|  for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (3126:19): [True: 105k, False: 13.1k]
  ------------------
 3127|   105k|    _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
 3128|   105k|                     _mm256_extracti128_si256(d[i], 1));
 3129|   105k|  }
 3130|  13.1k|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x64_avx2:
 3307|  4.87k|                                               int bd) {
 3308|  4.87k|  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
  ------------------
  |  |   19|  4.87k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 3309|  4.87k|  if (bd < 12) {
  ------------------
  |  Branch (3309:7): [True: 4.23k, False: 631]
  ------------------
 3310|  4.23k|    highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
 3311|  4.23k|  } else {
 3312|    631|    highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
 3313|    631|                                            dy);
 3314|    631|  }
 3315|  4.87k|  highbd_transpose(dstT, 64, dst, stride, 16, 64);
 3316|  4.87k|}
intrapred_avx2.c:highbd_dr_prediction_z3_8x4_avx2:
 2979|  66.0k|                                             int bd) {
 2980|  66.0k|  __m128i dstvec[8], d[4];
 2981|  66.0k|  if (bd < 12) {
  ------------------
  |  Branch (2981:7): [True: 37.7k, False: 28.3k]
  ------------------
 2982|  37.7k|    highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
 2983|  37.7k|                                              dy);
 2984|  37.7k|  } else {
 2985|  28.3k|    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
 2986|  28.3k|                                                    upsample_left, dy);
 2987|  28.3k|  }
 2988|       |
 2989|  66.0k|  highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 2990|  66.0k|                               &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
 2991|  66.0k|                               &d[0], &d[1], &d[2], &d[3]);
 2992|  66.0k|  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
 2993|  66.0k|  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
 2994|  66.0k|  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
 2995|  66.0k|  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
 2996|  66.0k|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x8_avx2:
 3024|  85.7k|                                              int bd) {
 3025|  85.7k|  __m128i dstvec[16], d[16];
 3026|  85.7k|  if (bd < 12) {
  ------------------
  |  Branch (3026:7): [True: 51.0k, False: 34.7k]
  ------------------
 3027|  51.0k|    highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
 3028|  51.0k|                                              dy);
 3029|  51.0k|  } else {
 3030|  34.7k|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
 3031|  34.7k|                                                    upsample_left, dy);
 3032|  34.7k|  }
 3033|   257k|  for (int i = 0; i < 16; i += 8) {
  ------------------
  |  Branch (3033:19): [True: 171k, False: 85.7k]
  ------------------
 3034|   171k|    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
 3035|   171k|                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
 3036|   171k|                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
 3037|   171k|                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
 3038|   171k|                             &d[5 + i], &d[6 + i], &d[7 + i]);
 3039|   171k|  }
 3040|   771k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3040:19): [True: 685k, False: 85.7k]
  ------------------
 3041|   685k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 3042|   685k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
 3043|   685k|  }
 3044|  85.7k|}
intrapred_avx2.c:highbd_dr_prediction_z3_32x16_avx2:
 3262|  30.4k|                                               int bd) {
 3263|  30.4k|  __m256i dstvec[32], d[16];
 3264|  30.4k|  if (bd < 12) {
  ------------------
  |  Branch (3264:7): [True: 27.6k, False: 2.74k]
  ------------------
 3265|  27.6k|    highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
 3266|  27.6k|                                               dy);
 3267|  27.6k|  } else {
 3268|  2.74k|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
 3269|  2.74k|                                                     upsample_left, dy);
 3270|  2.74k|  }
 3271|  91.2k|  for (int i = 0; i < 32; i += 16) {
  ------------------
  |  Branch (3271:19): [True: 60.8k, False: 30.4k]
  ------------------
 3272|  60.8k|    highbd_transpose16x16_avx2((dstvec + i), d);
 3273|  1.03M|    for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3273:21): [True: 973k, False: 60.8k]
  ------------------
 3274|   973k|      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
 3275|   973k|    }
 3276|  60.8k|  }
 3277|  30.4k|}
intrapred_avx2.c:highbd_dr_prediction_z3_64x32_avx2:
 3296|  2.94k|                                               int bd) {
 3297|  2.94k|  DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
  ------------------
  |  |   19|  2.94k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 3298|  2.94k|  highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
 3299|  2.94k|  highbd_transpose(dstT, 32, dst, stride, 64, 32);
 3300|  2.94k|  return;
 3301|  2.94k|}
intrapred_avx2.c:highbd_dr_prediction_z3_16x4_avx2:
 3076|  63.8k|                                              int bd) {
 3077|  63.8k|  __m128i dstvec[16], d[8];
 3078|  63.8k|  if (bd < 12) {
  ------------------
  |  Branch (3078:7): [True: 49.9k, False: 13.8k]
  ------------------
 3079|  49.9k|    highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
 3080|  49.9k|                                              dy);
 3081|  49.9k|  } else {
 3082|  13.8k|    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
 3083|  13.8k|                                                    upsample_left, dy);
 3084|  13.8k|  }
 3085|  63.8k|  highbd_transpose16x4_8x8_sse2(dstvec, d);
 3086|       |
 3087|  63.8k|  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
 3088|  63.8k|  _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
 3089|  63.8k|  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
 3090|  63.8k|  _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
 3091|  63.8k|  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
 3092|  63.8k|  _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
 3093|  63.8k|  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
 3094|  63.8k|  _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
 3095|  63.8k|}
intrapred_avx2.c:highbd_transpose16x4_8x8_sse2:
  139|  63.8k|static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
  140|  63.8k|  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  141|       |
  142|  63.8k|  r0 = _mm_unpacklo_epi16(x[0], x[1]);
  143|  63.8k|  r1 = _mm_unpacklo_epi16(x[2], x[3]);
  144|  63.8k|  r2 = _mm_unpacklo_epi16(x[4], x[5]);
  145|  63.8k|  r3 = _mm_unpacklo_epi16(x[6], x[7]);
  146|       |
  147|  63.8k|  r4 = _mm_unpacklo_epi16(x[8], x[9]);
  148|  63.8k|  r5 = _mm_unpacklo_epi16(x[10], x[11]);
  149|  63.8k|  r6 = _mm_unpacklo_epi16(x[12], x[13]);
  150|  63.8k|  r7 = _mm_unpacklo_epi16(x[14], x[15]);
  151|       |
  152|  63.8k|  r8 = _mm_unpacklo_epi32(r0, r1);
  153|  63.8k|  r9 = _mm_unpackhi_epi32(r0, r1);
  154|  63.8k|  r10 = _mm_unpacklo_epi32(r2, r3);
  155|  63.8k|  r11 = _mm_unpackhi_epi32(r2, r3);
  156|       |
  157|  63.8k|  r12 = _mm_unpacklo_epi32(r4, r5);
  158|  63.8k|  r13 = _mm_unpackhi_epi32(r4, r5);
  159|  63.8k|  r14 = _mm_unpacklo_epi32(r6, r7);
  160|  63.8k|  r15 = _mm_unpackhi_epi32(r6, r7);
  161|       |
  162|  63.8k|  r0 = _mm_unpacklo_epi64(r8, r9);
  163|  63.8k|  r1 = _mm_unpackhi_epi64(r8, r9);
  164|  63.8k|  r2 = _mm_unpacklo_epi64(r10, r11);
  165|  63.8k|  r3 = _mm_unpackhi_epi64(r10, r11);
  166|       |
  167|  63.8k|  r4 = _mm_unpacklo_epi64(r12, r13);
  168|  63.8k|  r5 = _mm_unpackhi_epi64(r12, r13);
  169|  63.8k|  r6 = _mm_unpacklo_epi64(r14, r15);
  170|  63.8k|  r7 = _mm_unpackhi_epi64(r14, r15);
  171|       |
  172|  63.8k|  d[0] = _mm_unpacklo_epi64(r0, r2);
  173|  63.8k|  d[1] = _mm_unpacklo_epi64(r4, r6);
  174|  63.8k|  d[2] = _mm_unpacklo_epi64(r1, r3);
  175|  63.8k|  d[3] = _mm_unpacklo_epi64(r5, r7);
  176|       |
  177|  63.8k|  d[4] = _mm_unpackhi_epi64(r0, r2);
  178|  63.8k|  d[5] = _mm_unpackhi_epi64(r4, r6);
  179|  63.8k|  d[6] = _mm_unpackhi_epi64(r1, r3);
  180|  63.8k|  d[7] = _mm_unpackhi_epi64(r5, r7);
  181|  63.8k|}
intrapred_avx2.c:highbd_dr_prediction_z3_32x8_avx2:
 3135|  57.0k|                                              int bd) {
 3136|  57.0k|  __m128i dstvec[32], d[32];
 3137|  57.0k|  if (bd < 12) {
  ------------------
  |  Branch (3137:7): [True: 48.8k, False: 8.23k]
  ------------------
 3138|  48.8k|    highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
 3139|  48.8k|                                              dy);
 3140|  48.8k|  } else {
 3141|  8.23k|    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
 3142|  8.23k|                                                    upsample_left, dy);
 3143|  8.23k|  }
 3144|       |
 3145|   285k|  for (int i = 0; i < 32; i += 8) {
  ------------------
  |  Branch (3145:19): [True: 228k, False: 57.0k]
  ------------------
 3146|   228k|    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
 3147|   228k|                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
 3148|   228k|                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
 3149|   228k|                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
 3150|   228k|                             &d[5 + i], &d[6 + i], &d[7 + i]);
 3151|   228k|  }
 3152|   513k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (3152:19): [True: 456k, False: 57.0k]
  ------------------
 3153|   456k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 3154|   456k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
 3155|   456k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
 3156|   456k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
 3157|   456k|  }
 3158|  57.0k|}
intrapred_avx2.c:highbd_dr_prediction_z3_64x16_avx2:
 3321|  17.3k|                                               int bd) {
 3322|  17.3k|  __m256i dstvec[64], d[16];
 3323|  17.3k|  if (bd < 12) {
  ------------------
  |  Branch (3323:7): [True: 16.7k, False: 605]
  ------------------
 3324|  16.7k|    highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
 3325|  16.7k|                                               dy);
 3326|  16.7k|  } else {
 3327|    605|    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
 3328|    605|                                                     upsample_left, dy);
 3329|    605|  }
 3330|  86.7k|  for (int i = 0; i < 64; i += 16) {
  ------------------
  |  Branch (3330:19): [True: 69.3k, False: 17.3k]
  ------------------
 3331|  69.3k|    highbd_transpose16x16_avx2((dstvec + i), d);
 3332|  1.17M|    for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (3332:21): [True: 1.10M, False: 69.3k]
  ------------------
 3333|  1.10M|      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
 3334|  1.10M|    }
 3335|  69.3k|  }
 3336|  17.3k|}
intrapred_avx2.c:dr_prediction_z1_4xN_avx2:
 3621|   138k|                                      int dx) {
 3622|   138k|  __m128i dstvec[16];
 3623|       |
 3624|   138k|  dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
 3625|   965k|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (3625:19): [True: 826k, False: 138k]
  ------------------
 3626|   826k|    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
 3627|   826k|  }
 3628|   138k|}
intrapred_avx2.c:dr_prediction_z1_HxW_internal_avx2:
 3550|   978k|    int dx) {
 3551|   978k|  const int frac_bits = 6 - upsample_above;
 3552|   978k|  const int max_base_x = ((W + H) - 1) << upsample_above;
 3553|       |
 3554|   978k|  assert(dx > 0);
 3555|       |  // pre-filter above pixels
 3556|       |  // store in temp buffers:
 3557|       |  //   above[x] * 32 + 16
 3558|       |  //   above[x+1] - above[x]
 3559|       |  // final pixels will be calculated as:
 3560|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3561|   978k|  __m256i a0, a1, a32, a16;
 3562|   978k|  __m256i diff, c3f;
 3563|   978k|  __m128i a_mbase_x;
 3564|       |
 3565|   978k|  a16 = _mm256_set1_epi16(16);
 3566|   978k|  a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
 3567|   978k|  c3f = _mm256_set1_epi16(0x3f);
 3568|       |
 3569|   978k|  int x = dx;
 3570|  12.8M|  for (int r = 0; r < W; r++) {
  ------------------
  |  Branch (3570:19): [True: 11.8M, False: 973k]
  ------------------
 3571|  11.8M|    __m256i b, res, shift;
 3572|  11.8M|    __m128i res1, a0_128, a1_128;
 3573|       |
 3574|  11.8M|    int base = x >> frac_bits;
 3575|  11.8M|    int base_max_diff = (max_base_x - base) >> upsample_above;
 3576|  11.8M|    if (base_max_diff <= 0) {
  ------------------
  |  Branch (3576:9): [True: 4.96k, False: 11.8M]
  ------------------
 3577|  15.6k|      for (int i = r; i < W; ++i) {
  ------------------
  |  Branch (3577:23): [True: 10.6k, False: 4.96k]
  ------------------
 3578|  10.6k|        dst[i] = a_mbase_x;  // save 4 values
 3579|  10.6k|      }
 3580|  4.96k|      return;
 3581|  4.96k|    }
 3582|  11.8M|    if (base_max_diff > H) base_max_diff = H;
  ------------------
  |  Branch (3582:9): [True: 11.5M, False: 305k]
  ------------------
 3583|  11.8M|    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
 3584|  11.8M|    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
 3585|       |
 3586|  11.8M|    if (upsample_above) {
  ------------------
  |  Branch (3586:9): [True: 1.94M, False: 9.88M]
  ------------------
 3587|  1.94M|      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
 3588|  1.94M|      a1_128 = _mm_srli_si128(a0_128, 8);
 3589|       |
 3590|  1.94M|      shift = _mm256_srli_epi16(
 3591|  1.94M|          _mm256_and_si256(
 3592|  1.94M|              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
 3593|  1.94M|          1);
 3594|  9.88M|    } else {
 3595|  9.88M|      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 3596|  9.88M|    }
 3597|  11.8M|    a0 = _mm256_cvtepu8_epi16(a0_128);
 3598|  11.8M|    a1 = _mm256_cvtepu8_epi16(a1_128);
 3599|       |
 3600|  11.8M|    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 3601|  11.8M|    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 3602|  11.8M|    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 3603|       |
 3604|  11.8M|    b = _mm256_mullo_epi16(diff, shift);
 3605|  11.8M|    res = _mm256_add_epi16(a32, b);
 3606|  11.8M|    res = _mm256_srli_epi16(res, 5);
 3607|       |
 3608|  11.8M|    res = _mm256_packus_epi16(
 3609|  11.8M|        res, _mm256_castsi128_si256(
 3610|  11.8M|                 _mm256_extracti128_si256(res, 1)));  // goto 8 bit
 3611|  11.8M|    res1 = _mm256_castsi256_si128(res);               // 16 8bit values
 3612|       |
 3613|  11.8M|    dst[r] =
 3614|  11.8M|        _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
 3615|  11.8M|    x += dx;
 3616|  11.8M|  }
 3617|   978k|}
intrapred_avx2.c:dr_prediction_z1_8xN_avx2:
 3632|   126k|                                      int dx) {
 3633|   126k|  __m128i dstvec[32];
 3634|       |
 3635|   126k|  dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
 3636|  1.36M|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (3636:19): [True: 1.23M, False: 126k]
  ------------------
 3637|  1.23M|    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
 3638|  1.23M|  }
 3639|   126k|}
intrapred_avx2.c:dr_prediction_z1_16xN_avx2:
 3643|   115k|                                       int dx) {
 3644|   115k|  __m128i dstvec[64];
 3645|       |
 3646|   115k|  dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
 3647|  1.71M|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (3647:19): [True: 1.59M, False: 115k]
  ------------------
 3648|  1.59M|    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
 3649|  1.59M|  }
 3650|   115k|}
intrapred_avx2.c:dr_prediction_z1_32xN_avx2:
 3724|  76.0k|                                       int dx) {
 3725|  76.0k|  __m256i dstvec[64];
 3726|  76.0k|  dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
 3727|  2.15M|  for (int i = 0; i < N; i++) {
  ------------------
  |  Branch (3727:19): [True: 2.07M, False: 76.0k]
  ------------------
 3728|  2.07M|    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
 3729|  2.07M|  }
 3730|  76.0k|}
intrapred_avx2.c:dr_prediction_z1_32xN_internal_avx2:
 3653|   188k|    int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
 3654|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 3655|   188k|  (void)upsample_above;
 3656|   188k|  const int frac_bits = 6;
 3657|   188k|  const int max_base_x = ((32 + N) - 1);
 3658|       |
 3659|       |  // pre-filter above pixels
 3660|       |  // store in temp buffers:
 3661|       |  //   above[x] * 32 + 16
 3662|       |  //   above[x+1] - above[x]
 3663|       |  // final pixels will be calculated as:
 3664|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3665|   188k|  __m256i a0, a1, a32, a16;
 3666|   188k|  __m256i a_mbase_x, diff, c3f;
 3667|       |
 3668|   188k|  a16 = _mm256_set1_epi16(16);
 3669|   188k|  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
 3670|   188k|  c3f = _mm256_set1_epi16(0x3f);
 3671|       |
 3672|   188k|  int x = dx;
 3673|  5.28M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (3673:19): [True: 5.09M, False: 188k]
  ------------------
 3674|  5.09M|    __m256i b, res, res16[2];
 3675|  5.09M|    __m128i a0_128, a1_128;
 3676|       |
 3677|  5.09M|    int base = x >> frac_bits;
 3678|  5.09M|    int base_max_diff = (max_base_x - base);
 3679|  5.09M|    if (base_max_diff <= 0) {
  ------------------
  |  Branch (3679:9): [True: 0, False: 5.09M]
  ------------------
 3680|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (3680:23): [True: 0, False: 0]
  ------------------
 3681|      0|        dstvec[i] = a_mbase_x;  // save 32 values
 3682|      0|      }
 3683|      0|      return;
 3684|      0|    }
 3685|  5.09M|    if (base_max_diff > 32) base_max_diff = 32;
  ------------------
  |  Branch (3685:9): [True: 5.02M, False: 74.3k]
  ------------------
 3686|  5.09M|    __m256i shift =
 3687|  5.09M|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 3688|       |
 3689|  15.2M|    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
  ------------------
  |  Branch (3689:29): [True: 10.1M, False: 5.09M]
  ------------------
 3690|  10.1M|      int mdiff = base_max_diff - j;
 3691|  10.1M|      if (mdiff <= 0) {
  ------------------
  |  Branch (3691:11): [True: 592, False: 10.1M]
  ------------------
 3692|    592|        res16[jj] = a_mbase_x;
 3693|  10.1M|      } else {
 3694|  10.1M|        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
 3695|  10.1M|        a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
 3696|  10.1M|        a0 = _mm256_cvtepu8_epi16(a0_128);
 3697|  10.1M|        a1 = _mm256_cvtepu8_epi16(a1_128);
 3698|       |
 3699|  10.1M|        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 3700|  10.1M|        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 3701|  10.1M|        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 3702|  10.1M|        b = _mm256_mullo_epi16(diff, shift);
 3703|       |
 3704|  10.1M|        res = _mm256_add_epi16(a32, b);
 3705|  10.1M|        res = _mm256_srli_epi16(res, 5);
 3706|  10.1M|        res16[jj] = _mm256_packus_epi16(
 3707|  10.1M|            res, _mm256_castsi128_si256(
 3708|  10.1M|                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
 3709|  10.1M|      }
 3710|  10.1M|    }
 3711|  5.09M|    res16[1] =
 3712|  5.09M|        _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
 3713|  5.09M|                                1);  // 32 8bit values
 3714|       |
 3715|  5.09M|    dstvec[r] = _mm256_blendv_epi8(
 3716|  5.09M|        a_mbase_x, res16[1],
 3717|  5.09M|        *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
 3718|  5.09M|    x += dx;
 3719|  5.09M|  }
 3720|   188k|}
intrapred_avx2.c:dr_prediction_z1_64xN_avx2:
 3734|  43.2k|                                       int dx) {
 3735|       |  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
 3736|  43.2k|  (void)upsample_above;
 3737|  43.2k|  const int frac_bits = 6;
 3738|  43.2k|  const int max_base_x = ((64 + N) - 1);
 3739|       |
 3740|       |  // pre-filter above pixels
 3741|       |  // store in temp buffers:
 3742|       |  //   above[x] * 32 + 16
 3743|       |  //   above[x+1] - above[x]
 3744|       |  // final pixels will be calculated as:
 3745|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3746|  43.2k|  __m256i a0, a1, a32, a16;
 3747|  43.2k|  __m256i a_mbase_x, diff, c3f;
 3748|  43.2k|  __m128i max_base_x128, base_inc128, mask128;
 3749|       |
 3750|  43.2k|  a16 = _mm256_set1_epi16(16);
 3751|  43.2k|  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
 3752|  43.2k|  max_base_x128 = _mm_set1_epi8(max_base_x);
 3753|  43.2k|  c3f = _mm256_set1_epi16(0x3f);
 3754|       |
 3755|  43.2k|  int x = dx;
 3756|  2.35M|  for (int r = 0; r < N; r++, dst += stride) {
  ------------------
  |  Branch (3756:19): [True: 2.31M, False: 43.2k]
  ------------------
 3757|  2.31M|    __m256i b, res;
 3758|  2.31M|    int base = x >> frac_bits;
 3759|  2.31M|    if (base >= max_base_x) {
  ------------------
  |  Branch (3759:9): [True: 0, False: 2.31M]
  ------------------
 3760|      0|      for (int i = r; i < N; ++i) {
  ------------------
  |  Branch (3760:23): [True: 0, False: 0]
  ------------------
 3761|      0|        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
 3762|      0|        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
 3763|      0|        dst += stride;
 3764|      0|      }
 3765|      0|      return;
 3766|      0|    }
 3767|       |
 3768|  2.31M|    __m256i shift =
 3769|  2.31M|        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
 3770|       |
 3771|  2.31M|    __m128i a0_128, a1_128, res128;
 3772|  11.5M|    for (int j = 0; j < 64; j += 16) {
  ------------------
  |  Branch (3772:21): [True: 9.25M, False: 2.31M]
  ------------------
 3773|  9.25M|      int mdif = max_base_x - (base + j);
 3774|  9.25M|      if (mdif <= 0) {
  ------------------
  |  Branch (3774:11): [True: 3.83k, False: 9.25M]
  ------------------
 3775|  3.83k|        _mm_storeu_si128((__m128i *)(dst + j),
 3776|  3.83k|                         _mm256_castsi256_si128(a_mbase_x));
 3777|  9.25M|      } else {
 3778|  9.25M|        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
 3779|  9.25M|        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
 3780|  9.25M|        a0 = _mm256_cvtepu8_epi16(a0_128);
 3781|  9.25M|        a1 = _mm256_cvtepu8_epi16(a1_128);
 3782|       |
 3783|  9.25M|        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
 3784|  9.25M|        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
 3785|  9.25M|        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 3786|  9.25M|        b = _mm256_mullo_epi16(diff, shift);
 3787|       |
 3788|  9.25M|        res = _mm256_add_epi16(a32, b);
 3789|  9.25M|        res = _mm256_srli_epi16(res, 5);
 3790|  9.25M|        res = _mm256_packus_epi16(
 3791|  9.25M|            res, _mm256_castsi128_si256(
 3792|  9.25M|                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
 3793|       |
 3794|  9.25M|        base_inc128 =
 3795|  9.25M|            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
 3796|  9.25M|                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
 3797|  9.25M|                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
 3798|  9.25M|                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
 3799|  9.25M|                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
 3800|  9.25M|                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
 3801|  9.25M|                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
 3802|  9.25M|                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));
 3803|       |
 3804|  9.25M|        mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
 3805|  9.25M|                                 _mm_setzero_si128());
 3806|  9.25M|        res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
 3807|  9.25M|                                 _mm256_castsi256_si128(res), mask128);
 3808|  9.25M|        _mm_storeu_si128((__m128i *)(dst + j), res128);
 3809|  9.25M|      }
 3810|  9.25M|    }
 3811|  2.31M|    x += dx;
 3812|  2.31M|  }
 3813|  43.2k|}
intrapred_avx2.c:dr_prediction_z2_Nx4_avx2:
 3845|   576k|                                      int dx, int dy) {
 3846|   576k|  const int min_base_x = -(1 << upsample_above);
 3847|   576k|  const int min_base_y = -(1 << upsample_left);
 3848|   576k|  const int frac_bits_x = 6 - upsample_above;
 3849|   576k|  const int frac_bits_y = 6 - upsample_left;
 3850|       |
 3851|   576k|  assert(dx > 0);
 3852|       |  // pre-filter above pixels
 3853|       |  // store in temp buffers:
 3854|       |  //   above[x] * 32 + 16
 3855|       |  //   above[x+1] - above[x]
 3856|       |  // final pixels will be calculated as:
 3857|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3858|   576k|  __m128i a0_x, a1_x, a32, a16, diff;
 3859|   576k|  __m128i c3f, min_base_y128, c1234, dy128;
 3860|       |
 3861|   576k|  a16 = _mm_set1_epi16(16);
 3862|   576k|  c3f = _mm_set1_epi16(0x3f);
 3863|   576k|  min_base_y128 = _mm_set1_epi16(min_base_y);
 3864|   576k|  c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
 3865|   576k|  dy128 = _mm_set1_epi16(dy);
 3866|       |
 3867|  3.44M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (3867:19): [True: 2.87M, False: 576k]
  ------------------
 3868|  2.87M|    __m128i b, res, shift, r6, ydx;
 3869|  2.87M|    __m128i resx, resy, resxy;
 3870|  2.87M|    __m128i a0_x128, a1_x128;
 3871|  2.87M|    int y = r + 1;
 3872|  2.87M|    int base_x = (-y * dx) >> frac_bits_x;
 3873|  2.87M|    int base_shift = 0;
 3874|  2.87M|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (3874:9): [True: 2.33M, False: 542k]
  ------------------
 3875|  2.33M|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 3876|  2.33M|    }
 3877|  2.87M|    int base_min_diff =
 3878|  2.87M|        (min_base_x - base_x + upsample_above) >> upsample_above;
 3879|  2.87M|    if (base_min_diff > 4) {
  ------------------
  |  Branch (3879:9): [True: 1.65M, False: 1.22M]
  ------------------
 3880|  1.65M|      base_min_diff = 4;
 3881|  1.65M|    } else {
 3882|  1.22M|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (3882:11): [True: 0, False: 1.22M]
  ------------------
 3883|  1.22M|    }
 3884|       |
 3885|  2.87M|    if (base_shift > 3) {
  ------------------
  |  Branch (3885:9): [True: 1.65M, False: 1.22M]
  ------------------
 3886|  1.65M|      a0_x = _mm_setzero_si128();
 3887|  1.65M|      a1_x = _mm_setzero_si128();
 3888|  1.65M|      shift = _mm_setzero_si128();
 3889|  1.65M|    } else {
 3890|  1.22M|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 3891|  1.22M|      ydx = _mm_set1_epi16(y * dx);
 3892|  1.22M|      r6 = _mm_slli_epi16(c1234, 6);
 3893|       |
 3894|  1.22M|      if (upsample_above) {
  ------------------
  |  Branch (3894:11): [True: 272k, False: 950k]
  ------------------
 3895|   272k|        a0_x128 =
 3896|   272k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
 3897|   272k|        a1_x128 = _mm_srli_si128(a0_x128, 8);
 3898|       |
 3899|   272k|        shift = _mm_srli_epi16(
 3900|   272k|            _mm_and_si128(
 3901|   272k|                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
 3902|   272k|            1);
 3903|   950k|      } else {
 3904|   950k|        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
 3905|   950k|        a1_x128 = _mm_srli_si128(a0_x128, 1);
 3906|       |
 3907|   950k|        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
 3908|   950k|      }
 3909|  1.22M|      a0_x = _mm_cvtepu8_epi16(a0_x128);
 3910|  1.22M|      a1_x = _mm_cvtepu8_epi16(a1_x128);
 3911|  1.22M|    }
 3912|       |    // y calc
 3913|  2.87M|    __m128i a0_y, a1_y, shifty;
 3914|  2.87M|    if (base_x < min_base_x) {
  ------------------
  |  Branch (3914:9): [True: 2.57M, False: 295k]
  ------------------
 3915|  2.57M|      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
  ------------------
  |  |   19|  2.57M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 3916|  2.57M|      __m128i y_c128, base_y_c128, mask128, c1234_;
 3917|  2.57M|      c1234_ = _mm_srli_si128(c1234, 2);
 3918|  2.57M|      r6 = _mm_set1_epi16(r << 6);
 3919|  2.57M|      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
 3920|  2.57M|      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
 3921|  2.57M|      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
 3922|  2.57M|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 3923|  2.57M|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 3924|       |
 3925|  2.57M|      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 3926|  2.57M|                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
 3927|  2.57M|      base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
 3928|  2.57M|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 3929|  2.57M|      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 3930|  2.57M|                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
 3931|       |
 3932|  2.57M|      if (upsample_left) {
  ------------------
  |  Branch (3932:11): [True: 1.79M, False: 783k]
  ------------------
 3933|  1.79M|        shifty = _mm_srli_epi16(
 3934|  1.79M|            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
 3935|  1.79M|      } else {
 3936|   783k|        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
 3937|   783k|      }
 3938|  2.57M|      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
 3939|  2.57M|      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
 3940|  2.57M|      shift = _mm_unpacklo_epi64(shift, shifty);
 3941|  2.57M|    }
 3942|       |
 3943|  2.87M|    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 3944|  2.87M|    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
 3945|  2.87M|    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
 3946|       |
 3947|  2.87M|    b = _mm_mullo_epi16(diff, shift);
 3948|  2.87M|    res = _mm_add_epi16(a32, b);
 3949|  2.87M|    res = _mm_srli_epi16(res, 5);
 3950|       |
 3951|  2.87M|    resx = _mm_packus_epi16(res, res);
 3952|  2.87M|    resy = _mm_srli_si128(resx, 4);
 3953|       |
 3954|  2.87M|    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
 3955|  2.87M|    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
 3956|  2.87M|    dst += stride;
 3957|  2.87M|  }
 3958|   576k|}
intrapred_avx2.c:dr_prediction_z2_Nx8_avx2:
 3963|   264k|                                      int dx, int dy) {
 3964|   264k|  const int min_base_x = -(1 << upsample_above);
 3965|   264k|  const int min_base_y = -(1 << upsample_left);
 3966|   264k|  const int frac_bits_x = 6 - upsample_above;
 3967|   264k|  const int frac_bits_y = 6 - upsample_left;
 3968|       |
 3969|       |  // pre-filter above pixels
 3970|       |  // store in temp buffers:
 3971|       |  //   above[x] * 32 + 16
 3972|       |  //   above[x+1] - above[x]
 3973|       |  // final pixels will be calculated as:
 3974|       |  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 3975|   264k|  __m256i diff, a32, a16;
 3976|   264k|  __m256i a0_x, a1_x;
 3977|   264k|  __m128i a0_x128, a1_x128, min_base_y128, c3f;
 3978|   264k|  __m128i c1234, dy128;
 3979|       |
 3980|   264k|  a16 = _mm256_set1_epi16(16);
 3981|   264k|  c3f = _mm_set1_epi16(0x3f);
 3982|   264k|  min_base_y128 = _mm_set1_epi16(min_base_y);
 3983|   264k|  dy128 = _mm_set1_epi16(dy);
 3984|   264k|  c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 3985|       |
 3986|  2.71M|  for (int r = 0; r < N; r++) {
  ------------------
  |  Branch (3986:19): [True: 2.44M, False: 264k]
  ------------------
 3987|  2.44M|    __m256i b, res, shift;
 3988|  2.44M|    __m128i resx, resy, resxy, r6, ydx;
 3989|       |
 3990|  2.44M|    int y = r + 1;
 3991|  2.44M|    int base_x = (-y * dx) >> frac_bits_x;
 3992|  2.44M|    int base_shift = 0;
 3993|  2.44M|    if (base_x < (min_base_x - 1)) {
  ------------------
  |  Branch (3993:9): [True: 1.87M, False: 576k]
  ------------------
 3994|  1.87M|      base_shift = (min_base_x - base_x - 1) >> upsample_above;
 3995|  1.87M|    }
 3996|  2.44M|    int base_min_diff =
 3997|  2.44M|        (min_base_x - base_x + upsample_above) >> upsample_above;
 3998|  2.44M|    if (base_min_diff > 8) {
  ------------------
  |  Branch (3998:9): [True: 1.10M, False: 1.33M]
  ------------------
 3999|  1.10M|      base_min_diff = 8;
 4000|  1.33M|    } else {
 4001|  1.33M|      if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (4001:11): [True: 0, False: 1.33M]
  ------------------
 4002|  1.33M|    }
 4003|       |
 4004|  2.44M|    if (base_shift > 7) {
  ------------------
  |  Branch (4004:9): [True: 1.10M, False: 1.33M]
  ------------------
 4005|  1.10M|      a0_x = _mm256_setzero_si256();
 4006|  1.10M|      a1_x = _mm256_setzero_si256();
 4007|  1.10M|      shift = _mm256_setzero_si256();
 4008|  1.33M|    } else {
 4009|  1.33M|      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
 4010|  1.33M|      ydx = _mm_set1_epi16(y * dx);
 4011|  1.33M|      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
 4012|  1.33M|      if (upsample_above) {
  ------------------
  |  Branch (4012:11): [True: 408k, False: 930k]
  ------------------
 4013|   408k|        a0_x128 =
 4014|   408k|            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
 4015|   408k|        a1_x128 = _mm_srli_si128(a0_x128, 8);
 4016|       |
 4017|   408k|        shift = _mm256_castsi128_si256(_mm_srli_epi16(
 4018|   408k|            _mm_and_si128(
 4019|   408k|                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
 4020|   408k|            1));
 4021|   930k|      } else {
 4022|   930k|        a1_x128 = _mm_srli_si128(a0_x128, 1);
 4023|   930k|        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
 4024|   930k|        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
 4025|       |
 4026|   930k|        shift = _mm256_castsi128_si256(
 4027|   930k|            _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
 4028|   930k|      }
 4029|  1.33M|      a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
 4030|  1.33M|      a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
 4031|  1.33M|    }
 4032|       |
 4033|       |    // y calc
 4034|  2.44M|    __m128i a0_y, a1_y, shifty;
 4035|  2.44M|    if (base_x < min_base_x) {
  ------------------
  |  Branch (4035:9): [True: 2.07M, False: 375k]
  ------------------
 4036|  2.07M|      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  ------------------
  |  |   19|  2.07M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 4037|  2.07M|      __m128i y_c128, base_y_c128, mask128;
 4038|  2.07M|      r6 = _mm_set1_epi16(r << 6);
 4039|  2.07M|      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
 4040|  2.07M|      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
 4041|  2.07M|      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
 4042|  2.07M|      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
 4043|  2.07M|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 4044|       |
 4045|  2.07M|      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 4046|  2.07M|                            left[base_y_c[2]], left[base_y_c[3]],
 4047|  2.07M|                            left[base_y_c[4]], left[base_y_c[5]],
 4048|  2.07M|                            left[base_y_c[6]], left[base_y_c[7]]);
 4049|  2.07M|      base_y_c128 = _mm_add_epi16(
 4050|  2.07M|          base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
 4051|  2.07M|      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
 4052|       |
 4053|  2.07M|      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
 4054|  2.07M|                            left[base_y_c[2]], left[base_y_c[3]],
 4055|  2.07M|                            left[base_y_c[4]], left[base_y_c[5]],
 4056|  2.07M|                            left[base_y_c[6]], left[base_y_c[7]]);
 4057|       |
 4058|  2.07M|      if (upsample_left) {
  ------------------
  |  Branch (4058:11): [True: 614k, False: 1.45M]
  ------------------
 4059|   614k|        shifty = _mm_srli_epi16(
 4060|   614k|            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
 4061|  1.45M|      } else {
 4062|  1.45M|        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
 4063|  1.45M|      }
 4064|       |
 4065|  2.07M|      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
 4066|  2.07M|      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
 4067|  2.07M|      shift = _mm256_inserti128_si256(shift, shifty, 1);
 4068|  2.07M|    }
 4069|       |
 4070|  2.44M|    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 4071|  2.44M|    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 4072|  2.44M|    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 4073|       |
 4074|  2.44M|    b = _mm256_mullo_epi16(diff, shift);
 4075|  2.44M|    res = _mm256_add_epi16(a32, b);
 4076|  2.44M|    res = _mm256_srli_epi16(res, 5);
 4077|       |
 4078|  2.44M|    resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
 4079|  2.44M|                            _mm256_castsi256_si128(res));
 4080|  2.44M|    resy = _mm256_extracti128_si256(res, 1);
 4081|  2.44M|    resy = _mm_packus_epi16(resy, resy);
 4082|       |
 4083|  2.44M|    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
 4084|  2.44M|    _mm_storel_epi64((__m128i *)(dst), resxy);
 4085|  2.44M|    dst += stride;
 4086|  2.44M|  }
 4087|   264k|}
intrapred_avx2.c:dr_prediction_z2_HxW_avx2:
 4092|   446k|                                      int upsample_left, int dx, int dy) {
 4093|       |  // here upsample_above and upsample_left are 0 by design of
 4094|       |  // av1_use_intra_edge_upsample
 4095|   446k|  const int min_base_x = -1;
 4096|   446k|  const int min_base_y = -1;
 4097|   446k|  (void)upsample_above;
 4098|   446k|  (void)upsample_left;
 4099|   446k|  const int frac_bits_x = 6;
 4100|   446k|  const int frac_bits_y = 6;
 4101|       |
 4102|   446k|  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
 4103|   446k|  __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
 4104|   446k|  __m128i a0_x128, a1_x128;
 4105|       |
 4106|   446k|  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  ------------------
  |  |   19|   446k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 4107|   446k|  a16 = _mm256_set1_epi16(16);
 4108|   446k|  c1 = _mm256_srli_epi16(a16, 4);
 4109|   446k|  min_base_y256 = _mm256_set1_epi16(min_base_y);
 4110|   446k|  c3f = _mm256_set1_epi16(0x3f);
 4111|   446k|  dy256 = _mm256_set1_epi16(dy);
 4112|   446k|  c0123 =
 4113|   446k|      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 4114|   446k|  c1234 = _mm256_add_epi16(c0123, c1);
 4115|       |
 4116|  8.84M|  for (int r = 0; r < H; r++) {
  ------------------
  |  Branch (4116:19): [True: 8.40M, False: 446k]
  ------------------
 4117|  8.40M|    __m256i b, res, shift, j256, r6, ydx;
 4118|  8.40M|    __m128i resx, resy;
 4119|  8.40M|    __m128i resxy;
 4120|  8.40M|    int y = r + 1;
 4121|  8.40M|    ydx = _mm256_set1_epi16((int16_t)(y * dx));
 4122|       |
 4123|  8.40M|    int base_x = (-y * dx) >> frac_bits_x;
 4124|  23.7M|    for (int j = 0; j < W; j += 16) {
  ------------------
  |  Branch (4124:21): [True: 15.3M, False: 8.40M]
  ------------------
 4125|  15.3M|      j256 = _mm256_set1_epi16(j);
 4126|  15.3M|      int base_shift = 0;
 4127|  15.3M|      if ((base_x + j) < (min_base_x - 1)) {
  ------------------
  |  Branch (4127:11): [True: 11.0M, False: 4.33M]
  ------------------
 4128|  11.0M|        base_shift = (min_base_x - (base_x + j) - 1);
 4129|  11.0M|      }
 4130|  15.3M|      int base_min_diff = (min_base_x - base_x - j);
 4131|  15.3M|      if (base_min_diff > 16) {
  ------------------
  |  Branch (4131:11): [True: 7.80M, False: 7.57M]
  ------------------
 4132|  7.80M|        base_min_diff = 16;
 4133|  7.80M|      } else {
 4134|  7.57M|        if (base_min_diff < 0) base_min_diff = 0;
  ------------------
  |  Branch (4134:13): [True: 2.95M, False: 4.62M]
  ------------------
 4135|  7.57M|      }
 4136|       |
 4137|  15.3M|      if (base_shift < 16) {
  ------------------
  |  Branch (4137:11): [True: 7.58M, False: 7.79M]
  ------------------
 4138|  7.58M|        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
 4139|  7.58M|        a1_x128 =
 4140|  7.58M|            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
 4141|  7.58M|        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
 4142|  7.58M|        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
 4143|       |
 4144|  7.58M|        a0_x = _mm256_cvtepu8_epi16(a0_x128);
 4145|  7.58M|        a1_x = _mm256_cvtepu8_epi16(a1_x128);
 4146|       |
 4147|  7.58M|        r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
 4148|  7.58M|        shift = _mm256_srli_epi16(
 4149|  7.58M|            _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
 4150|       |
 4151|  7.58M|        diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
 4152|  7.58M|        a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
 4153|  7.58M|        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 4154|       |
 4155|  7.58M|        b = _mm256_mullo_epi16(diff, shift);
 4156|  7.58M|        res = _mm256_add_epi16(a32, b);
 4157|  7.58M|        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
 4158|  7.58M|        resx = _mm256_castsi256_si128(_mm256_packus_epi16(
 4159|  7.58M|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
 4160|  7.79M|      } else {
 4161|  7.79M|        resx = _mm_setzero_si128();
 4162|  7.79M|      }
 4163|       |
 4164|       |      // y calc
 4165|  15.3M|      if (base_x < min_base_x) {
  ------------------
  |  Branch (4165:11): [True: 14.2M, False: 1.15M]
  ------------------
 4166|  14.2M|        __m256i c256, y_c256, base_y_c256, mask256, mul16;
 4167|  14.2M|        r6 = _mm256_set1_epi16(r << 6);
 4168|  14.2M|        c256 = _mm256_add_epi16(j256, c1234);
 4169|  14.2M|        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
 4170|  14.2M|                                 _mm256_srli_epi16(min_base_y256, 1));
 4171|  14.2M|        y_c256 = _mm256_sub_epi16(r6, mul16);
 4172|       |
 4173|  14.2M|        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
 4174|  14.2M|        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
 4175|       |
 4176|  14.2M|        base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
 4177|  14.2M|        int16_t min_y = (int16_t)_mm_extract_epi16(
 4178|  14.2M|            _mm256_extracti128_si256(base_y_c256, 1), 7);
 4179|  14.2M|        int16_t max_y =
 4180|  14.2M|            (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
 4181|  14.2M|        int16_t offset_diff = max_y - min_y;
 4182|       |
 4183|  14.2M|        if (offset_diff < 16) {
  ------------------
  |  Branch (4183:13): [True: 13.3M, False: 871k]
  ------------------
 4184|  13.3M|          __m256i min_y256 = _mm256_set1_epi16(min_y);
 4185|       |
 4186|  13.3M|          __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
 4187|  13.3M|          __m128i base_y_offset128 =
 4188|  13.3M|              _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
 4189|  13.3M|                              _mm256_extracti128_si256(base_y_offset, 1));
 4190|       |
 4191|  13.3M|          __m128i a0_y128 = _mm_maskload_epi32(
 4192|  13.3M|              (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
 4193|  13.3M|          __m128i a1_y128 =
 4194|  13.3M|              _mm_maskload_epi32((int *)(left + min_y + 1),
 4195|  13.3M|                                 *(__m128i *)LoadMaskz2[offset_diff / 4]);
 4196|  13.3M|          a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
 4197|  13.3M|          a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
 4198|  13.3M|          a0_y = _mm256_cvtepu8_epi16(a0_y128);
 4199|  13.3M|          a1_y = _mm256_cvtepu8_epi16(a1_y128);
 4200|  13.3M|        } else {
 4201|   871k|          base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
 4202|   871k|          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 4203|       |
 4204|   871k|          a0_y = _mm256_setr_epi16(
 4205|   871k|              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 4206|   871k|              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 4207|   871k|              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
 4208|   871k|              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
 4209|   871k|              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
 4210|   871k|              left[base_y_c[15]]);
 4211|   871k|          base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
 4212|   871k|          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
 4213|       |
 4214|   871k|          a1_y = _mm256_setr_epi16(
 4215|   871k|              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
 4216|   871k|              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
 4217|   871k|              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
 4218|   871k|              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
 4219|   871k|              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
 4220|   871k|              left[base_y_c[15]]);
 4221|   871k|        }
 4222|  14.2M|        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
 4223|       |
 4224|  14.2M|        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
 4225|  14.2M|        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
 4226|  14.2M|        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
 4227|       |
 4228|  14.2M|        b = _mm256_mullo_epi16(diff, shifty);
 4229|  14.2M|        res = _mm256_add_epi16(a32, b);
 4230|  14.2M|        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
 4231|  14.2M|        resy = _mm256_castsi256_si128(_mm256_packus_epi16(
 4232|  14.2M|            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
 4233|  14.2M|      } else {
 4234|  1.15M|        resy = _mm_setzero_si128();
 4235|  1.15M|      }
 4236|  15.3M|      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
 4237|  15.3M|      _mm_storeu_si128((__m128i *)(dst + j), resxy);
 4238|  15.3M|    }  // for j
 4239|  8.40M|    dst += stride;
 4240|  8.40M|  }
 4241|   446k|}
intrapred_avx2.c:dr_prediction_z3_4x4_avx2:
 4360|   115k|                                      int dy) {
 4361|   115k|  __m128i dstvec[4], d[4];
 4362|       |
 4363|   115k|  dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
 4364|   115k|  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 4365|   115k|                            &d[0], &d[1], &d[2], &d[3]);
 4366|       |
 4367|   115k|  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
 4368|   115k|  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
 4369|   115k|  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
 4370|   115k|  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
 4371|   115k|  return;
 4372|   115k|}
intrapred_avx2.c:dr_prediction_z3_8x8_avx2:
 4376|  98.0k|                                      int dy) {
 4377|  98.0k|  __m128i dstvec[8], d[8];
 4378|       |
 4379|  98.0k|  dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
 4380|  98.0k|  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
 4381|  98.0k|                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
 4382|  98.0k|                    &d[3]);
 4383|       |
 4384|  98.0k|  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
 4385|  98.0k|  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
 4386|  98.0k|  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
 4387|  98.0k|  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
 4388|  98.0k|  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
 4389|  98.0k|  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
 4390|  98.0k|  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
 4391|  98.0k|  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
 4392|  98.0k|}
intrapred_avx2.c:dr_prediction_z3_16x16_avx2:
 4538|  87.1k|                                        int dy) {
 4539|  87.1k|  __m128i dstvec[16], d[16];
 4540|       |
 4541|  87.1k|  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
 4542|  87.1k|  transpose16x16_sse2(dstvec, d);
 4543|       |
 4544|  1.48M|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (4544:19): [True: 1.39M, False: 87.1k]
  ------------------
 4545|  1.39M|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 4546|  1.39M|  }
 4547|  87.1k|}
intrapred_avx2.c:dr_prediction_z3_32x32_avx2:
 4551|  81.0k|                                        int dy) {
 4552|  81.0k|  __m256i dstvec[32], d[32];
 4553|       |
 4554|  81.0k|  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
 4555|  81.0k|  transpose16x32_avx2(dstvec, d);
 4556|  81.0k|  transpose16x32_avx2(dstvec + 16, d + 16);
 4557|  1.37M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4557:19): [True: 1.29M, False: 81.0k]
  ------------------
 4558|  1.29M|    _mm_storeu_si128((__m128i *)(dst + j * stride),
 4559|  1.29M|                     _mm256_castsi256_si128(d[j]));
 4560|  1.29M|    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
 4561|  1.29M|                     _mm256_castsi256_si128(d[j + 16]));
 4562|  1.29M|  }
 4563|  1.37M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4563:19): [True: 1.29M, False: 81.0k]
  ------------------
 4564|  1.29M|    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
 4565|  1.29M|                     _mm256_extracti128_si256(d[j], 1));
 4566|  1.29M|    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
 4567|  1.29M|                     _mm256_extracti128_si256(d[j + 16], 1));
 4568|  1.29M|  }
 4569|  81.0k|}
intrapred_avx2.c:transpose16x32_avx2:
 4268|   193k|static inline void transpose16x32_avx2(__m256i *x, __m256i *d) {
 4269|   193k|  __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
 4270|   193k|  __m256i w10, w11, w12, w13, w14, w15;
 4271|       |
 4272|   193k|  w0 = _mm256_unpacklo_epi8(x[0], x[1]);
 4273|   193k|  w1 = _mm256_unpacklo_epi8(x[2], x[3]);
 4274|   193k|  w2 = _mm256_unpacklo_epi8(x[4], x[5]);
 4275|   193k|  w3 = _mm256_unpacklo_epi8(x[6], x[7]);
 4276|       |
 4277|   193k|  w8 = _mm256_unpacklo_epi8(x[8], x[9]);
 4278|   193k|  w9 = _mm256_unpacklo_epi8(x[10], x[11]);
 4279|   193k|  w10 = _mm256_unpacklo_epi8(x[12], x[13]);
 4280|   193k|  w11 = _mm256_unpacklo_epi8(x[14], x[15]);
 4281|       |
 4282|   193k|  w4 = _mm256_unpacklo_epi16(w0, w1);
 4283|   193k|  w5 = _mm256_unpacklo_epi16(w2, w3);
 4284|   193k|  w12 = _mm256_unpacklo_epi16(w8, w9);
 4285|   193k|  w13 = _mm256_unpacklo_epi16(w10, w11);
 4286|       |
 4287|   193k|  w6 = _mm256_unpacklo_epi32(w4, w5);
 4288|   193k|  w7 = _mm256_unpackhi_epi32(w4, w5);
 4289|   193k|  w14 = _mm256_unpacklo_epi32(w12, w13);
 4290|   193k|  w15 = _mm256_unpackhi_epi32(w12, w13);
 4291|       |
 4292|       |  // Store first 4-line result
 4293|   193k|  d[0] = _mm256_unpacklo_epi64(w6, w14);
 4294|   193k|  d[1] = _mm256_unpackhi_epi64(w6, w14);
 4295|   193k|  d[2] = _mm256_unpacklo_epi64(w7, w15);
 4296|   193k|  d[3] = _mm256_unpackhi_epi64(w7, w15);
 4297|       |
 4298|   193k|  w4 = _mm256_unpackhi_epi16(w0, w1);
 4299|   193k|  w5 = _mm256_unpackhi_epi16(w2, w3);
 4300|   193k|  w12 = _mm256_unpackhi_epi16(w8, w9);
 4301|   193k|  w13 = _mm256_unpackhi_epi16(w10, w11);
 4302|       |
 4303|   193k|  w6 = _mm256_unpacklo_epi32(w4, w5);
 4304|   193k|  w7 = _mm256_unpackhi_epi32(w4, w5);
 4305|   193k|  w14 = _mm256_unpacklo_epi32(w12, w13);
 4306|   193k|  w15 = _mm256_unpackhi_epi32(w12, w13);
 4307|       |
 4308|       |  // Store second 4-line result
 4309|   193k|  d[4] = _mm256_unpacklo_epi64(w6, w14);
 4310|   193k|  d[5] = _mm256_unpackhi_epi64(w6, w14);
 4311|   193k|  d[6] = _mm256_unpacklo_epi64(w7, w15);
 4312|   193k|  d[7] = _mm256_unpackhi_epi64(w7, w15);
 4313|       |
 4314|       |  // upper half
 4315|   193k|  w0 = _mm256_unpackhi_epi8(x[0], x[1]);
 4316|   193k|  w1 = _mm256_unpackhi_epi8(x[2], x[3]);
 4317|   193k|  w2 = _mm256_unpackhi_epi8(x[4], x[5]);
 4318|   193k|  w3 = _mm256_unpackhi_epi8(x[6], x[7]);
 4319|       |
 4320|   193k|  w8 = _mm256_unpackhi_epi8(x[8], x[9]);
 4321|   193k|  w9 = _mm256_unpackhi_epi8(x[10], x[11]);
 4322|   193k|  w10 = _mm256_unpackhi_epi8(x[12], x[13]);
 4323|   193k|  w11 = _mm256_unpackhi_epi8(x[14], x[15]);
 4324|       |
 4325|   193k|  w4 = _mm256_unpacklo_epi16(w0, w1);
 4326|   193k|  w5 = _mm256_unpacklo_epi16(w2, w3);
 4327|   193k|  w12 = _mm256_unpacklo_epi16(w8, w9);
 4328|   193k|  w13 = _mm256_unpacklo_epi16(w10, w11);
 4329|       |
 4330|   193k|  w6 = _mm256_unpacklo_epi32(w4, w5);
 4331|   193k|  w7 = _mm256_unpackhi_epi32(w4, w5);
 4332|   193k|  w14 = _mm256_unpacklo_epi32(w12, w13);
 4333|   193k|  w15 = _mm256_unpackhi_epi32(w12, w13);
 4334|       |
 4335|       |  // Store first 4-line result
 4336|   193k|  d[8] = _mm256_unpacklo_epi64(w6, w14);
 4337|   193k|  d[9] = _mm256_unpackhi_epi64(w6, w14);
 4338|   193k|  d[10] = _mm256_unpacklo_epi64(w7, w15);
 4339|   193k|  d[11] = _mm256_unpackhi_epi64(w7, w15);
 4340|       |
 4341|   193k|  w4 = _mm256_unpackhi_epi16(w0, w1);
 4342|   193k|  w5 = _mm256_unpackhi_epi16(w2, w3);
 4343|   193k|  w12 = _mm256_unpackhi_epi16(w8, w9);
 4344|   193k|  w13 = _mm256_unpackhi_epi16(w10, w11);
 4345|       |
 4346|   193k|  w6 = _mm256_unpacklo_epi32(w4, w5);
 4347|   193k|  w7 = _mm256_unpackhi_epi32(w4, w5);
 4348|   193k|  w14 = _mm256_unpacklo_epi32(w12, w13);
 4349|   193k|  w15 = _mm256_unpackhi_epi32(w12, w13);
 4350|       |
 4351|       |  // Store second 4-line result
 4352|   193k|  d[12] = _mm256_unpacklo_epi64(w6, w14);
 4353|   193k|  d[13] = _mm256_unpackhi_epi64(w6, w14);
 4354|   193k|  d[14] = _mm256_unpacklo_epi64(w7, w15);
 4355|   193k|  d[15] = _mm256_unpackhi_epi64(w7, w15);
 4356|   193k|}
intrapred_avx2.c:dr_prediction_z3_64x64_avx2:
 4573|  23.0k|                                        int dy) {
 4574|  23.0k|  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
  ------------------
  |  |   19|  23.0k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 4575|  23.0k|  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
 4576|  23.0k|  transpose(dstT, 64, dst, stride, 64, 64);
 4577|  23.0k|}
intrapred_avx2.c:dr_prediction_z3_4x8_avx2:
 4396|  23.9k|                                      int dy) {
 4397|  23.9k|  __m128i dstvec[4], d[8];
 4398|       |
 4399|  23.9k|  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
 4400|  23.9k|  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
 4401|  23.9k|                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
 4402|   215k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (4402:19): [True: 191k, False: 23.9k]
  ------------------
 4403|   191k|    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
 4404|   191k|  }
 4405|  23.9k|}
intrapred_avx2.c:dr_prediction_z3_8x16_avx2:
 4424|  26.2k|                                       int dy) {
 4425|  26.2k|  __m128i dstvec[8], d[8];
 4426|       |
 4427|  26.2k|  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
 4428|  26.2k|  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
 4429|  26.2k|                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
 4430|  26.2k|                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
 4431|   236k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (4431:19): [True: 210k, False: 26.2k]
  ------------------
 4432|   210k|    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
 4433|   210k|    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
 4434|   210k|                     _mm_srli_si128(d[i], 8));
 4435|   210k|  }
 4436|  26.2k|}
intrapred_avx2.c:dr_prediction_z3_16x32_avx2:
 4581|  21.2k|                                        int dy) {
 4582|  21.2k|  __m256i dstvec[16], d[16];
 4583|       |
 4584|  21.2k|  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
 4585|  21.2k|  transpose16x32_avx2(dstvec, d);
 4586|       |  // store
 4587|   361k|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4587:19): [True: 340k, False: 21.2k]
  ------------------
 4588|   340k|    _mm_storeu_si128((__m128i *)(dst + j * stride),
 4589|   340k|                     _mm256_castsi256_si128(d[j]));
 4590|   340k|    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
 4591|   340k|                     _mm256_extracti128_si256(d[j], 1));
 4592|   340k|  }
 4593|  21.2k|}
intrapred_avx2.c:dr_prediction_z3_32x64_avx2:
 4611|  1.55k|                                        int dy) {
 4612|  1.55k|  uint8_t dstT[64 * 32];
 4613|  1.55k|  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
 4614|  1.55k|  transpose(dstT, 64, dst, stride, 32, 64);
 4615|  1.55k|}
intrapred_avx2.c:dr_prediction_z3_4x16_avx2:
 4458|  17.3k|                                       int dy) {
 4459|  17.3k|  __m128i dstvec[4], d[16];
 4460|       |
 4461|  17.3k|  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
 4462|  17.3k|  transpose4x16_sse2(dstvec, d);
 4463|   295k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (4463:19): [True: 278k, False: 17.3k]
  ------------------
 4464|   278k|    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
 4465|   278k|  }
 4466|  17.3k|}
intrapred_avx2.c:dr_prediction_z3_8x32_avx2:
 4490|  10.5k|                                       int dy) {
 4491|  10.5k|  __m256i dstvec[16], d[16];
 4492|       |
 4493|  10.5k|  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
 4494|  94.7k|  for (int i = 8; i < 16; i++) {
  ------------------
  |  Branch (4494:19): [True: 84.2k, False: 10.5k]
  ------------------
 4495|  84.2k|    dstvec[i] = _mm256_setzero_si256();
 4496|  84.2k|  }
 4497|  10.5k|  transpose16x32_avx2(dstvec, d);
 4498|       |
 4499|   178k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (4499:19): [True: 168k, False: 10.5k]
  ------------------
 4500|   168k|    _mm_storel_epi64((__m128i *)(dst + i * stride),
 4501|   168k|                     _mm256_castsi256_si128(d[i]));
 4502|   168k|  }
 4503|   178k|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (4503:19): [True: 168k, False: 10.5k]
  ------------------
 4504|   168k|    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
 4505|   168k|                     _mm256_extracti128_si256(d[i], 1));
 4506|   168k|  }
 4507|  10.5k|}
intrapred_avx2.c:dr_prediction_z3_16x64_avx2:
 4629|  4.15k|                                        int dy) {
 4630|  4.15k|  uint8_t dstT[64 * 16];
 4631|  4.15k|  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
 4632|  4.15k|  transpose(dstT, 64, dst, stride, 16, 64);
 4633|  4.15k|}
intrapred_avx2.c:dr_prediction_z3_8x4_avx2:
 4409|  40.7k|                                      int dy) {
 4410|  40.7k|  __m128i dstvec[8], d[4];
 4411|       |
 4412|  40.7k|  dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
 4413|  40.7k|  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
 4414|  40.7k|                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
 4415|  40.7k|                        &d[1], &d[2], &d[3]);
 4416|  40.7k|  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
 4417|  40.7k|  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
 4418|  40.7k|  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
 4419|  40.7k|  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
 4420|  40.7k|}
intrapred_avx2.c:dr_prediction_z3_16x8_avx2:
 4440|  50.2k|                                       int dy) {
 4441|  50.2k|  __m128i dstvec[16], d[16];
 4442|       |
 4443|  50.2k|  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
 4444|  50.2k|  transpose16x8_8x16_sse2(
 4445|  50.2k|      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
 4446|  50.2k|      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
 4447|  50.2k|      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
 4448|  50.2k|      &d[3], &d[4], &d[5], &d[6], &d[7]);
 4449|       |
 4450|   452k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (4450:19): [True: 402k, False: 50.2k]
  ------------------
 4451|   402k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 4452|   402k|  }
 4453|  50.2k|}
intrapred_avx2.c:dr_prediction_z3_32x16_avx2:
 4597|  21.0k|                                        int dy) {
 4598|  21.0k|  __m128i dstvec[32], d[16];
 4599|       |
 4600|  21.0k|  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
 4601|  63.2k|  for (int i = 0; i < 32; i += 16) {
  ------------------
  |  Branch (4601:19): [True: 42.1k, False: 21.0k]
  ------------------
 4602|  42.1k|    transpose16x16_sse2((dstvec + i), d);
 4603|   717k|    for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4603:21): [True: 674k, False: 42.1k]
  ------------------
 4604|   674k|      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
 4605|   674k|    }
 4606|  42.1k|  }
 4607|  21.0k|}
intrapred_avx2.c:dr_prediction_z3_64x32_avx2:
 4619|  2.90k|                                        int dy) {
 4620|  2.90k|  uint8_t dstT[32 * 64];
 4621|  2.90k|  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
 4622|  2.90k|  transpose(dstT, 32, dst, stride, 64, 32);
 4623|  2.90k|  return;
 4624|  2.90k|}
intrapred_avx2.c:dr_prediction_z3_16x4_avx2:
 4470|  55.1k|                                       int dy) {
 4471|  55.1k|  __m128i dstvec[16], d[8];
 4472|       |
 4473|  55.1k|  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
 4474|   275k|  for (int i = 4; i < 8; i++) {
  ------------------
  |  Branch (4474:19): [True: 220k, False: 55.1k]
  ------------------
 4475|   220k|    d[i] = _mm_setzero_si128();
 4476|   220k|  }
 4477|  55.1k|  transpose16x8_8x16_sse2(
 4478|  55.1k|      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
 4479|  55.1k|      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
 4480|  55.1k|      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
 4481|  55.1k|      &d[3], &d[4], &d[5], &d[6], &d[7]);
 4482|       |
 4483|   275k|  for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (4483:19): [True: 220k, False: 55.1k]
  ------------------
 4484|   220k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 4485|   220k|  }
 4486|  55.1k|}
intrapred_avx2.c:dr_prediction_z3_32x8_avx2:
 4511|  45.2k|                                       int dy) {
 4512|  45.2k|  __m128i dstvec[32], d[16];
 4513|       |
 4514|  45.2k|  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
 4515|       |
 4516|  45.2k|  transpose16x8_8x16_sse2(
 4517|  45.2k|      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
 4518|  45.2k|      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
 4519|  45.2k|      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
 4520|  45.2k|      &d[3], &d[4], &d[5], &d[6], &d[7]);
 4521|  45.2k|  transpose16x8_8x16_sse2(
 4522|  45.2k|      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
 4523|  45.2k|      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
 4524|  45.2k|      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
 4525|  45.2k|      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
 4526|  45.2k|      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
 4527|  45.2k|      &d[6 + 8], &d[7 + 8]);
 4528|       |
 4529|   406k|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (4529:19): [True: 361k, False: 45.2k]
  ------------------
 4530|   361k|    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
 4531|   361k|    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
 4532|   361k|  }
 4533|  45.2k|}
intrapred_avx2.c:dr_prediction_z3_64x16_avx2:
 4637|  16.3k|                                        int dy) {
 4638|  16.3k|  __m128i dstvec[64], d[16];
 4639|       |
 4640|  16.3k|  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
 4641|  81.5k|  for (int i = 0; i < 64; i += 16) {
  ------------------
  |  Branch (4641:19): [True: 65.2k, False: 16.3k]
  ------------------
 4642|  65.2k|    transpose16x16_sse2((dstvec + i), d);
 4643|  1.10M|    for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (4643:21): [True: 1.04M, False: 65.2k]
  ------------------
 4644|  1.04M|      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
 4645|  1.04M|    }
 4646|  65.2k|  }
 4647|  16.3k|}

aom_dc_predictor_4x8_sse2:
  110|   277k|                               const uint8_t *above, const uint8_t *left) {
  111|   277k|  const __m128i sum_left = dc_sum_8(left);
  112|   277k|  __m128i sum_above = dc_sum_4(above);
  113|   277k|  sum_above = _mm_add_epi16(sum_left, sum_above);
  114|       |
  115|   277k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  116|   277k|  sum += 6;
  117|   277k|  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|   277k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  118|       |
  119|   277k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  120|   277k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
  121|   277k|  dc_store_4xh(pred, 8, dst, stride);
  122|   277k|}
aom_dc_predictor_4x16_sse2:
  126|   182k|                                const uint8_t *above, const uint8_t *left) {
  127|   182k|  const __m128i sum_left = dc_sum_16_sse2(left);
  128|   182k|  __m128i sum_above = dc_sum_4(above);
  129|   182k|  sum_above = _mm_add_epi16(sum_left, sum_above);
  130|       |
  131|   182k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  132|   182k|  sum += 10;
  133|   182k|  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|   182k|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  134|       |
  135|   182k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  136|   182k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
  137|   182k|  dc_store_4xh(pred, 16, dst, stride);
  138|   182k|}
aom_dc_predictor_8x4_sse2:
  142|   422k|                               const uint8_t *above, const uint8_t *left) {
  143|   422k|  const __m128i sum_left = dc_sum_4(left);
  144|   422k|  __m128i sum_above = dc_sum_8(above);
  145|   422k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  146|       |
  147|   422k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  148|   422k|  sum += 6;
  149|   422k|  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|   422k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  150|       |
  151|   422k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  152|   422k|  dc_store_8xh(&row, 4, dst, stride);
  153|   422k|}
aom_dc_predictor_8x16_sse2:
  156|   214k|                                const uint8_t *above, const uint8_t *left) {
  157|   214k|  const __m128i sum_left = dc_sum_16_sse2(left);
  158|   214k|  __m128i sum_above = dc_sum_8(above);
  159|   214k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  160|       |
  161|   214k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  162|   214k|  sum += 12;
  163|   214k|  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|   214k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  164|   214k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  165|   214k|  dc_store_8xh(&row, 16, dst, stride);
  166|   214k|}
aom_dc_predictor_8x32_sse2:
  170|   102k|                                const uint8_t *above, const uint8_t *left) {
  171|   102k|  const __m128i sum_left = dc_sum_32_sse2(left);
  172|   102k|  __m128i sum_above = dc_sum_8(above);
  173|   102k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  174|       |
  175|   102k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  176|   102k|  sum += 20;
  177|   102k|  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|   102k|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  178|   102k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  179|   102k|  dc_store_8xh(&row, 32, dst, stride);
  180|   102k|}
aom_dc_predictor_16x4_sse2:
  183|   619k|                                const uint8_t *above, const uint8_t *left) {
  184|   619k|  const __m128i sum_left = dc_sum_4(left);
  185|   619k|  __m128i sum_above = dc_sum_16_sse2(above);
  186|   619k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  187|       |
  188|   619k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  189|   619k|  sum += 10;
  190|   619k|  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|   619k|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  191|   619k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  192|   619k|  dc_store_16xh(&row, 4, dst, stride);
  193|   619k|}
aom_dc_predictor_16x8_sse2:
  197|   349k|                                const uint8_t *above, const uint8_t *left) {
  198|   349k|  const __m128i sum_left = dc_sum_8(left);
  199|   349k|  __m128i sum_above = dc_sum_16_sse2(above);
  200|   349k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  201|       |
  202|   349k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  203|   349k|  sum += 12;
  204|   349k|  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|   349k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  205|   349k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  206|   349k|  dc_store_16xh(&row, 8, dst, stride);
  207|   349k|}
aom_dc_predictor_16x32_sse2:
  210|   143k|                                 const uint8_t *above, const uint8_t *left) {
  211|   143k|  const __m128i sum_left = dc_sum_32_sse2(left);
  212|   143k|  __m128i sum_above = dc_sum_16_sse2(above);
  213|   143k|  sum_above = _mm_add_epi16(sum_left, sum_above);
  214|       |
  215|   143k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  216|   143k|  sum += 24;
  217|   143k|  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
  ------------------
  |  |   95|   143k|#define DC_MULTIPLIER_1X2 0x5556
  ------------------
  218|   143k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  219|   143k|  dc_store_16xh(&row, 32, dst, stride);
  220|   143k|}
aom_dc_predictor_16x64_sse2:
  224|  18.5k|                                 const uint8_t *above, const uint8_t *left) {
  225|  18.5k|  const __m128i sum_left = dc_sum_64(left);
  226|  18.5k|  __m128i sum_above = dc_sum_16_sse2(above);
  227|  18.5k|  sum_above = _mm_add_epi16(sum_left, sum_above);
  228|       |
  229|  18.5k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  230|  18.5k|  sum += 40;
  231|  18.5k|  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|  18.5k|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  232|  18.5k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  233|  18.5k|  dc_store_16xh(&row, 64, dst, stride);
  234|  18.5k|}
aom_dc_predictor_32x8_sse2:
  237|   374k|                                const uint8_t *above, const uint8_t *left) {
  238|   374k|  __m128i sum_above = dc_sum_32_sse2(above);
  239|   374k|  const __m128i sum_left = dc_sum_8(left);
  240|   374k|  sum_above = _mm_add_epi16(sum_above, sum_left);
  241|       |
  242|   374k|  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
  243|   374k|  sum += 20;
  244|   374k|  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
  ------------------
  |  |   96|   374k|#define DC_MULTIPLIER_1X4 0x3334
  ------------------
  245|   374k|  const __m128i row = _mm_set1_epi8((int8_t)sum);
  246|   374k|  dc_store_32xh(&row, 8, dst, stride);
  247|   374k|}
aom_dc_top_predictor_4x8_sse2:
  321|  4.33k|                                   const uint8_t *above, const uint8_t *left) {
  322|  4.33k|  (void)left;
  323|  4.33k|  __m128i sum_above = dc_sum_4(above);
  324|  4.33k|  const __m128i two = _mm_set1_epi16(2);
  325|  4.33k|  sum_above = _mm_add_epi16(sum_above, two);
  326|  4.33k|  sum_above = _mm_srai_epi16(sum_above, 2);
  327|  4.33k|  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  328|  4.33k|  sum_above = _mm_packus_epi16(sum_above, sum_above);
  329|       |
  330|  4.33k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
  331|  4.33k|  dc_store_4xh(pred, 8, dst, stride);
  332|  4.33k|}
aom_dc_top_predictor_4x16_sse2:
  336|  2.15k|                                    const uint8_t *above, const uint8_t *left) {
  337|  2.15k|  (void)left;
  338|  2.15k|  __m128i sum_above = dc_sum_4(above);
  339|  2.15k|  const __m128i two = _mm_set1_epi16(2);
  340|  2.15k|  sum_above = _mm_add_epi16(sum_above, two);
  341|  2.15k|  sum_above = _mm_srai_epi16(sum_above, 2);
  342|  2.15k|  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  343|  2.15k|  sum_above = _mm_packus_epi16(sum_above, sum_above);
  344|       |
  345|  2.15k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
  346|  2.15k|  dc_store_4xh(pred, 16, dst, stride);
  347|  2.15k|}
aom_dc_top_predictor_8x4_sse2:
  351|  2.44k|                                   const uint8_t *above, const uint8_t *left) {
  352|  2.44k|  (void)left;
  353|  2.44k|  __m128i sum_above = dc_sum_8(above);
  354|  2.44k|  const __m128i four = _mm_set1_epi16(4);
  355|  2.44k|  sum_above = _mm_add_epi16(sum_above, four);
  356|  2.44k|  sum_above = _mm_srai_epi16(sum_above, 3);
  357|  2.44k|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  358|  2.44k|  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  359|  2.44k|  dc_store_8xh(&row, 4, dst, stride);
  360|  2.44k|}
aom_dc_top_predictor_8x16_sse2:
  363|  3.72k|                                    const uint8_t *above, const uint8_t *left) {
  364|  3.72k|  (void)left;
  365|  3.72k|  __m128i sum_above = dc_sum_8(above);
  366|  3.72k|  const __m128i four = _mm_set1_epi16(4);
  367|  3.72k|  sum_above = _mm_add_epi16(sum_above, four);
  368|  3.72k|  sum_above = _mm_srai_epi16(sum_above, 3);
  369|  3.72k|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  370|  3.72k|  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  371|  3.72k|  dc_store_8xh(&row, 16, dst, stride);
  372|  3.72k|}
aom_dc_top_predictor_8x32_sse2:
  376|  5.61k|                                    const uint8_t *above, const uint8_t *left) {
  377|  5.61k|  (void)left;
  378|  5.61k|  __m128i sum_above = dc_sum_8(above);
  379|  5.61k|  const __m128i four = _mm_set1_epi16(4);
  380|  5.61k|  sum_above = _mm_add_epi16(sum_above, four);
  381|  5.61k|  sum_above = _mm_srai_epi16(sum_above, 3);
  382|  5.61k|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  383|  5.61k|  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  384|  5.61k|  dc_store_8xh(&row, 32, dst, stride);
  385|  5.61k|}
aom_dc_top_predictor_16x4_sse2:
  388|  5.28k|                                    const uint8_t *above, const uint8_t *left) {
  389|  5.28k|  (void)left;
  390|  5.28k|  __m128i sum_above = dc_sum_16_sse2(above);
  391|  5.28k|  const __m128i eight = _mm_set1_epi16(8);
  392|  5.28k|  sum_above = _mm_add_epi16(sum_above, eight);
  393|  5.28k|  sum_above = _mm_srai_epi16(sum_above, 4);
  394|  5.28k|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  395|  5.28k|  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  396|  5.28k|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  397|  5.28k|  dc_store_16xh(&row, 4, dst, stride);
  398|  5.28k|}
aom_dc_top_predictor_16x8_sse2:
  402|  1.64k|                                    const uint8_t *above, const uint8_t *left) {
  403|  1.64k|  (void)left;
  404|  1.64k|  __m128i sum_above = dc_sum_16_sse2(above);
  405|  1.64k|  const __m128i eight = _mm_set1_epi16(8);
  406|  1.64k|  sum_above = _mm_add_epi16(sum_above, eight);
  407|  1.64k|  sum_above = _mm_srai_epi16(sum_above, 4);
  408|  1.64k|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  409|  1.64k|  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  410|  1.64k|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  411|  1.64k|  dc_store_16xh(&row, 8, dst, stride);
  412|  1.64k|}
aom_dc_top_predictor_16x32_sse2:
  416|  5.49k|                                     const uint8_t *left) {
  417|  5.49k|  (void)left;
  418|  5.49k|  __m128i sum_above = dc_sum_16_sse2(above);
  419|  5.49k|  const __m128i eight = _mm_set1_epi16(8);
  420|  5.49k|  sum_above = _mm_add_epi16(sum_above, eight);
  421|  5.49k|  sum_above = _mm_srai_epi16(sum_above, 4);
  422|  5.49k|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  423|  5.49k|  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  424|  5.49k|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  425|  5.49k|  dc_store_16xh(&row, 32, dst, stride);
  426|  5.49k|}
aom_dc_top_predictor_16x64_sse2:
  431|    238|                                     const uint8_t *left) {
  432|    238|  (void)left;
  433|    238|  __m128i sum_above = dc_sum_16_sse2(above);
  434|    238|  const __m128i eight = _mm_set1_epi16(8);
  435|    238|  sum_above = _mm_add_epi16(sum_above, eight);
  436|    238|  sum_above = _mm_srai_epi16(sum_above, 4);
  437|    238|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  438|    238|  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  439|    238|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  440|    238|  dc_store_16xh(&row, 64, dst, stride);
  441|    238|}
aom_dc_top_predictor_32x8_sse2:
  444|  8.78k|                                    const uint8_t *above, const uint8_t *left) {
  445|  8.78k|  (void)left;
  446|  8.78k|  __m128i sum_above = dc_sum_32_sse2(above);
  447|  8.78k|  const __m128i sixteen = _mm_set1_epi16(16);
  448|  8.78k|  sum_above = _mm_add_epi16(sum_above, sixteen);
  449|  8.78k|  sum_above = _mm_srai_epi16(sum_above, 5);
  450|  8.78k|  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  451|  8.78k|  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  452|  8.78k|  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  453|  8.78k|  dc_store_32xh(&row, 8, dst, stride);
  454|  8.78k|}
aom_dc_left_predictor_4x8_sse2:
  533|  4.61k|                                    const uint8_t *above, const uint8_t *left) {
  534|  4.61k|  (void)above;
  535|  4.61k|  __m128i sum_left = dc_sum_8(left);
  536|  4.61k|  const __m128i four = _mm_set1_epi16(4);
  537|  4.61k|  sum_left = _mm_add_epi16(sum_left, four);
  538|  4.61k|  sum_left = _mm_srai_epi16(sum_left, 3);
  539|  4.61k|  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  540|  4.61k|  sum_left = _mm_packus_epi16(sum_left, sum_left);
  541|       |
  542|  4.61k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
  543|  4.61k|  dc_store_4xh(pred, 8, dst, stride);
  544|  4.61k|}
aom_dc_left_predictor_4x16_sse2:
  549|  3.94k|                                     const uint8_t *left) {
  550|  3.94k|  (void)above;
  551|  3.94k|  __m128i sum_left = dc_sum_16_sse2(left);
  552|  3.94k|  const __m128i eight = _mm_set1_epi16(8);
  553|  3.94k|  sum_left = _mm_add_epi16(sum_left, eight);
  554|  3.94k|  sum_left = _mm_srai_epi16(sum_left, 4);
  555|  3.94k|  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  556|  3.94k|  sum_left = _mm_packus_epi16(sum_left, sum_left);
  557|       |
  558|  3.94k|  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
  559|  3.94k|  dc_store_4xh(pred, 16, dst, stride);
  560|  3.94k|}
aom_dc_left_predictor_8x4_sse2:
  564|  4.93k|                                    const uint8_t *above, const uint8_t *left) {
  565|  4.93k|  (void)above;
  566|  4.93k|  __m128i sum_left = dc_sum_4(left);
  567|  4.93k|  const __m128i two = _mm_set1_epi16(2);
  568|  4.93k|  sum_left = _mm_add_epi16(sum_left, two);
  569|  4.93k|  sum_left = _mm_srai_epi16(sum_left, 2);
  570|  4.93k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  571|  4.93k|  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  572|  4.93k|  dc_store_8xh(&row, 4, dst, stride);
  573|  4.93k|}
aom_dc_left_predictor_8x16_sse2:
  577|  2.84k|                                     const uint8_t *left) {
  578|  2.84k|  (void)above;
  579|  2.84k|  __m128i sum_left = dc_sum_16_sse2(left);
  580|  2.84k|  const __m128i eight = _mm_set1_epi16(8);
  581|  2.84k|  sum_left = _mm_add_epi16(sum_left, eight);
  582|  2.84k|  sum_left = _mm_srai_epi16(sum_left, 4);
  583|  2.84k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  584|  2.84k|  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  585|  2.84k|  dc_store_8xh(&row, 16, dst, stride);
  586|  2.84k|}
aom_dc_left_predictor_8x32_sse2:
  591|  6.21k|                                     const uint8_t *left) {
  592|  6.21k|  (void)above;
  593|  6.21k|  __m128i sum_left = dc_sum_32_sse2(left);
  594|  6.21k|  const __m128i sixteen = _mm_set1_epi16(16);
  595|  6.21k|  sum_left = _mm_add_epi16(sum_left, sixteen);
  596|  6.21k|  sum_left = _mm_srai_epi16(sum_left, 5);
  597|  6.21k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  598|  6.21k|  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  599|  6.21k|  dc_store_8xh(&row, 32, dst, stride);
  600|  6.21k|}
aom_dc_left_predictor_16x4_sse2:
  604|  3.75k|                                     const uint8_t *left) {
  605|  3.75k|  (void)above;
  606|  3.75k|  __m128i sum_left = dc_sum_4(left);
  607|  3.75k|  const __m128i two = _mm_set1_epi16(2);
  608|  3.75k|  sum_left = _mm_add_epi16(sum_left, two);
  609|  3.75k|  sum_left = _mm_srai_epi16(sum_left, 2);
  610|  3.75k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  611|  3.75k|  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  612|  3.75k|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  613|  3.75k|  dc_store_16xh(&row, 4, dst, stride);
  614|  3.75k|}
aom_dc_left_predictor_16x8_sse2:
  619|  7.97k|                                     const uint8_t *left) {
  620|  7.97k|  (void)above;
  621|  7.97k|  __m128i sum_left = dc_sum_8(left);
  622|  7.97k|  const __m128i four = _mm_set1_epi16(4);
  623|  7.97k|  sum_left = _mm_add_epi16(sum_left, four);
  624|  7.97k|  sum_left = _mm_srai_epi16(sum_left, 3);
  625|  7.97k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  626|  7.97k|  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  627|  7.97k|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  628|  7.97k|  dc_store_16xh(&row, 8, dst, stride);
  629|  7.97k|}
aom_dc_left_predictor_16x32_sse2:
  633|  4.50k|                                      const uint8_t *left) {
  634|  4.50k|  (void)above;
  635|  4.50k|  __m128i sum_left = dc_sum_32_sse2(left);
  636|  4.50k|  const __m128i sixteen = _mm_set1_epi16(16);
  637|  4.50k|  sum_left = _mm_add_epi16(sum_left, sixteen);
  638|  4.50k|  sum_left = _mm_srai_epi16(sum_left, 5);
  639|  4.50k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  640|  4.50k|  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  641|  4.50k|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  642|  4.50k|  dc_store_16xh(&row, 32, dst, stride);
  643|  4.50k|}
aom_dc_left_predictor_16x64_sse2:
  648|    458|                                      const uint8_t *left) {
  649|    458|  (void)above;
  650|    458|  __m128i sum_left = dc_sum_64(left);
  651|    458|  const __m128i thirtytwo = _mm_set1_epi16(32);
  652|    458|  sum_left = _mm_add_epi16(sum_left, thirtytwo);
  653|    458|  sum_left = _mm_srai_epi16(sum_left, 6);
  654|    458|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  655|    458|  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  656|    458|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  657|    458|  dc_store_16xh(&row, 64, dst, stride);
  658|    458|}
aom_dc_left_predictor_32x8_sse2:
  662|  2.44k|                                     const uint8_t *left) {
  663|  2.44k|  (void)above;
  664|  2.44k|  __m128i sum_left = dc_sum_8(left);
  665|  2.44k|  const __m128i four = _mm_set1_epi16(4);
  666|  2.44k|  sum_left = _mm_add_epi16(sum_left, four);
  667|  2.44k|  sum_left = _mm_srai_epi16(sum_left, 3);
  668|  2.44k|  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  669|  2.44k|  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  670|  2.44k|  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  671|  2.44k|  dc_store_32xh(&row, 8, dst, stride);
  672|  2.44k|}
aom_dc_128_predictor_4x8_sse2:
  751|  5.41k|                                   const uint8_t *above, const uint8_t *left) {
  752|  5.41k|  (void)above;
  753|  5.41k|  (void)left;
  754|  5.41k|  const uint32_t pred = 0x80808080;
  755|  5.41k|  dc_store_4xh(pred, 8, dst, stride);
  756|  5.41k|}
aom_dc_128_predictor_4x16_sse2:
  760|    164|                                    const uint8_t *above, const uint8_t *left) {
  761|    164|  (void)above;
  762|    164|  (void)left;
  763|    164|  const uint32_t pred = 0x80808080;
  764|    164|  dc_store_4xh(pred, 16, dst, stride);
  765|    164|}
aom_dc_128_predictor_8x4_sse2:
  769|    289|                                   const uint8_t *above, const uint8_t *left) {
  770|    289|  (void)above;
  771|    289|  (void)left;
  772|    289|  const __m128i row = _mm_set1_epi8((int8_t)128);
  773|    289|  dc_store_8xh(&row, 4, dst, stride);
  774|    289|}
aom_dc_128_predictor_8x16_sse2:
  777|  1.73k|                                    const uint8_t *above, const uint8_t *left) {
  778|  1.73k|  (void)above;
  779|  1.73k|  (void)left;
  780|  1.73k|  const __m128i row = _mm_set1_epi8((int8_t)128);
  781|  1.73k|  dc_store_8xh(&row, 16, dst, stride);
  782|  1.73k|}
aom_dc_128_predictor_8x32_sse2:
  786|    120|                                    const uint8_t *above, const uint8_t *left) {
  787|    120|  (void)above;
  788|    120|  (void)left;
  789|    120|  const __m128i row = _mm_set1_epi8((int8_t)128);
  790|    120|  dc_store_8xh(&row, 32, dst, stride);
  791|    120|}
aom_dc_128_predictor_16x4_sse2:
  794|    106|                                    const uint8_t *above, const uint8_t *left) {
  795|    106|  (void)above;
  796|    106|  (void)left;
  797|    106|  const __m128i row = _mm_set1_epi8((int8_t)128);
  798|    106|  dc_store_16xh(&row, 4, dst, stride);
  799|    106|}
aom_dc_128_predictor_16x8_sse2:
  803|    189|                                    const uint8_t *above, const uint8_t *left) {
  804|    189|  (void)above;
  805|    189|  (void)left;
  806|    189|  const __m128i row = _mm_set1_epi8((int8_t)128);
  807|    189|  dc_store_16xh(&row, 8, dst, stride);
  808|    189|}
aom_dc_128_predictor_16x32_sse2:
  812|  2.37k|                                     const uint8_t *left) {
  813|  2.37k|  (void)above;
  814|  2.37k|  (void)left;
  815|  2.37k|  const __m128i row = _mm_set1_epi8((int8_t)128);
  816|  2.37k|  dc_store_16xh(&row, 32, dst, stride);
  817|  2.37k|}
aom_dc_128_predictor_16x64_sse2:
  822|     34|                                     const uint8_t *left) {
  823|     34|  (void)above;
  824|     34|  (void)left;
  825|     34|  const __m128i row = _mm_set1_epi8((int8_t)128);
  826|     34|  dc_store_16xh(&row, 64, dst, stride);
  827|     34|}
aom_dc_128_predictor_32x8_sse2:
  830|     90|                                    const uint8_t *above, const uint8_t *left) {
  831|     90|  (void)above;
  832|     90|  (void)left;
  833|     90|  const __m128i row = _mm_set1_epi8((int8_t)128);
  834|     90|  dc_store_32xh(&row, 8, dst, stride);
  835|     90|}
aom_v_predictor_4x8_sse2:
  889|  25.7k|                              const uint8_t *above, const uint8_t *left) {
  890|  25.7k|  const uint32_t pred = *(uint32_t *)above;
  891|  25.7k|  (void)left;
  892|  25.7k|  dc_store_4xh(pred, 8, dst, stride);
  893|  25.7k|}
aom_v_predictor_4x16_sse2:
  897|  7.40k|                               const uint8_t *above, const uint8_t *left) {
  898|  7.40k|  const uint32_t pred = *(uint32_t *)above;
  899|  7.40k|  (void)left;
  900|  7.40k|  dc_store_4xh(pred, 16, dst, stride);
  901|  7.40k|}
aom_v_predictor_8x4_sse2:
  905|  38.8k|                              const uint8_t *above, const uint8_t *left) {
  906|  38.8k|  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  907|  38.8k|  (void)left;
  908|  38.8k|  dc_store_8xh(&row, 4, dst, stride);
  909|  38.8k|}
aom_v_predictor_8x16_sse2:
  912|  18.9k|                               const uint8_t *above, const uint8_t *left) {
  913|  18.9k|  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  914|  18.9k|  (void)left;
  915|  18.9k|  dc_store_8xh(&row, 16, dst, stride);
  916|  18.9k|}
aom_v_predictor_8x32_sse2:
  920|  4.16k|                               const uint8_t *above, const uint8_t *left) {
  921|  4.16k|  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  922|  4.16k|  (void)left;
  923|  4.16k|  dc_store_8xh(&row, 32, dst, stride);
  924|  4.16k|}
aom_v_predictor_16x4_sse2:
  927|  32.3k|                               const uint8_t *above, const uint8_t *left) {
  928|  32.3k|  const __m128i row = _mm_load_si128((__m128i const *)above);
  929|  32.3k|  (void)left;
  930|  32.3k|  dc_store_16xh(&row, 4, dst, stride);
  931|  32.3k|}
aom_v_predictor_16x8_sse2:
  935|  32.1k|                               const uint8_t *above, const uint8_t *left) {
  936|  32.1k|  const __m128i row = _mm_load_si128((__m128i const *)above);
  937|  32.1k|  (void)left;
  938|  32.1k|  dc_store_16xh(&row, 8, dst, stride);
  939|  32.1k|}
aom_v_predictor_16x32_sse2:
  942|  9.48k|                                const uint8_t *above, const uint8_t *left) {
  943|  9.48k|  const __m128i row = _mm_load_si128((__m128i const *)above);
  944|  9.48k|  (void)left;
  945|  9.48k|  dc_store_16xh(&row, 32, dst, stride);
  946|  9.48k|}
aom_v_predictor_16x64_sse2:
  950|  1.65k|                                const uint8_t *above, const uint8_t *left) {
  951|  1.65k|  const __m128i row = _mm_load_si128((__m128i const *)above);
  952|  1.65k|  (void)left;
  953|  1.65k|  dc_store_16xh(&row, 64, dst, stride);
  954|  1.65k|}
aom_v_predictor_32x8_sse2:
  970|  14.1k|                               const uint8_t *above, const uint8_t *left) {
  971|  14.1k|  (void)left;
  972|  14.1k|  v_predictor_32xh(dst, stride, above, 8);
  973|  14.1k|}
aom_h_predictor_4x8_sse2:
 1027|  39.4k|                              const uint8_t *above, const uint8_t *left) {
 1028|  39.4k|  (void)above;
 1029|  39.4k|  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
 1030|  39.4k|  left_col = _mm_unpacklo_epi8(left_col, left_col);
 1031|  39.4k|  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
 1032|  39.4k|  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
 1033|  39.4k|  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
 1034|  39.4k|  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
 1035|  39.4k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1036|  39.4k|  dst += stride;
 1037|  39.4k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1038|  39.4k|  dst += stride;
 1039|  39.4k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1040|  39.4k|  dst += stride;
 1041|  39.4k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1042|  39.4k|  dst += stride;
 1043|  39.4k|  left_col = _mm_unpackhi_epi64(left_col, left_col);
 1044|  39.4k|  row0 = _mm_shufflelo_epi16(left_col, 0);
 1045|  39.4k|  row1 = _mm_shufflelo_epi16(left_col, 0x55);
 1046|  39.4k|  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
 1047|  39.4k|  row3 = _mm_shufflelo_epi16(left_col, 0xff);
 1048|  39.4k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1049|  39.4k|  dst += stride;
 1050|  39.4k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1051|  39.4k|  dst += stride;
 1052|  39.4k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1053|  39.4k|  dst += stride;
 1054|  39.4k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1055|  39.4k|}
aom_h_predictor_4x16_sse2:
 1059|  20.7k|                               const uint8_t *above, const uint8_t *left) {
 1060|  20.7k|  (void)above;
 1061|  20.7k|  const __m128i left_col = _mm_load_si128((__m128i const *)left);
 1062|  20.7k|  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
 1063|  20.7k|  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
 1064|       |
 1065|  20.7k|  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
 1066|  20.7k|  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
 1067|  20.7k|  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
 1068|  20.7k|  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
 1069|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1070|  20.7k|  dst += stride;
 1071|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1072|  20.7k|  dst += stride;
 1073|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1074|  20.7k|  dst += stride;
 1075|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1076|  20.7k|  dst += stride;
 1077|       |
 1078|  20.7k|  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
 1079|  20.7k|  row0 = _mm_shufflelo_epi16(left_col_low, 0);
 1080|  20.7k|  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
 1081|  20.7k|  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
 1082|  20.7k|  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
 1083|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1084|  20.7k|  dst += stride;
 1085|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1086|  20.7k|  dst += stride;
 1087|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1088|  20.7k|  dst += stride;
 1089|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1090|  20.7k|  dst += stride;
 1091|       |
 1092|  20.7k|  row0 = _mm_shufflelo_epi16(left_col_high, 0);
 1093|  20.7k|  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
 1094|  20.7k|  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
 1095|  20.7k|  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
 1096|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1097|  20.7k|  dst += stride;
 1098|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1099|  20.7k|  dst += stride;
 1100|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1101|  20.7k|  dst += stride;
 1102|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1103|  20.7k|  dst += stride;
 1104|       |
 1105|  20.7k|  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
 1106|  20.7k|  row0 = _mm_shufflelo_epi16(left_col_high, 0);
 1107|  20.7k|  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
 1108|  20.7k|  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
 1109|  20.7k|  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
 1110|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row0);
 1111|  20.7k|  dst += stride;
 1112|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row1);
 1113|  20.7k|  dst += stride;
 1114|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row2);
 1115|  20.7k|  dst += stride;
 1116|  20.7k|  *(int *)dst = _mm_cvtsi128_si32(row3);
 1117|  20.7k|}
aom_h_predictor_8x4_sse2:
 1121|  74.3k|                              const uint8_t *above, const uint8_t *left) {
 1122|  74.3k|  (void)above;
 1123|  74.3k|  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
 1124|  74.3k|  left_col = _mm_unpacklo_epi8(left_col, left_col);
 1125|  74.3k|  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
 1126|  74.3k|  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
 1127|  74.3k|  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
 1128|  74.3k|  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
 1129|  74.3k|  _mm_storel_epi64((__m128i *)dst, row0);
 1130|  74.3k|  dst += stride;
 1131|  74.3k|  _mm_storel_epi64((__m128i *)dst, row1);
 1132|  74.3k|  dst += stride;
 1133|  74.3k|  _mm_storel_epi64((__m128i *)dst, row2);
 1134|  74.3k|  dst += stride;
 1135|  74.3k|  _mm_storel_epi64((__m128i *)dst, row3);
 1136|  74.3k|}
aom_h_predictor_8x16_sse2:
 1205|  40.2k|                               const uint8_t *above, const uint8_t *left) {
 1206|  40.2k|  h_predictor_8x16xc(dst, stride, above, left, 1);
 1207|  40.2k|}
aom_h_predictor_8x32_sse2:
 1211|  9.56k|                               const uint8_t *above, const uint8_t *left) {
 1212|  9.56k|  h_predictor_8x16xc(dst, stride, above, left, 2);
 1213|  9.56k|}
aom_h_predictor_16x4_sse2:
 1269|   154k|                               const uint8_t *above, const uint8_t *left) {
 1270|   154k|  (void)above;
 1271|   154k|  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
 1272|   154k|  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
 1273|   154k|  h_prediction_16x8_1(&left_col_8p, dst, stride);
 1274|   154k|}
aom_h_predictor_16x8_sse2:
 1278|  90.2k|                               const uint8_t *above, const uint8_t *left) {
 1279|  90.2k|  (void)above;
 1280|  90.2k|  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
 1281|  90.2k|  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
 1282|  90.2k|  h_prediction_16x8_1(&left_col_8p, dst, stride);
 1283|  90.2k|  dst += stride << 2;
 1284|  90.2k|  h_prediction_16x8_2(&left_col_8p, dst, stride);
 1285|  90.2k|}
aom_h_predictor_16x32_sse2:
 1310|  23.0k|                                const uint8_t *above, const uint8_t *left) {
 1311|  23.0k|  (void)above;
 1312|  23.0k|  h_predictor_16xh(dst, stride, left, 2);
 1313|  23.0k|}
aom_h_predictor_16x64_sse2:
 1317|  2.64k|                                const uint8_t *above, const uint8_t *left) {
 1318|  2.64k|  (void)above;
 1319|  2.64k|  h_predictor_16xh(dst, stride, left, 4);
 1320|  2.64k|}
aom_h_predictor_32x8_sse2:
 1353|   183k|                               const uint8_t *above, const uint8_t *left) {
 1354|   183k|  __m128i left_col, left_col_8p;
 1355|   183k|  (void)above;
 1356|       |
 1357|   183k|  left_col = _mm_load_si128((const __m128i *)left);
 1358|       |
 1359|   183k|  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
 1360|   183k|  h_prediction_32x8_1(&left_col_8p, dst, stride);
 1361|   183k|  dst += stride << 2;
 1362|   183k|  h_prediction_32x8_2(&left_col_8p, dst, stride);
 1363|   183k|}
aom_h_predictor_32x16_sse2:
 1367|  58.9k|                                const uint8_t *above, const uint8_t *left) {
 1368|  58.9k|  __m128i left_col, left_col_8p;
 1369|  58.9k|  (void)above;
 1370|       |
 1371|  58.9k|  left_col = _mm_load_si128((const __m128i *)left);
 1372|       |
 1373|  58.9k|  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
 1374|  58.9k|  h_prediction_32x8_1(&left_col_8p, dst, stride);
 1375|  58.9k|  dst += stride << 2;
 1376|  58.9k|  h_prediction_32x8_2(&left_col_8p, dst, stride);
 1377|  58.9k|  dst += stride << 2;
 1378|       |
 1379|  58.9k|  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
 1380|  58.9k|  h_prediction_32x8_1(&left_col_8p, dst, stride);
 1381|  58.9k|  dst += stride << 2;
 1382|  58.9k|  h_prediction_32x8_2(&left_col_8p, dst, stride);
 1383|  58.9k|}
aom_h_predictor_32x64_sse2:
 1410|  3.19k|                                const uint8_t *above, const uint8_t *left) {
 1411|  3.19k|  (void)above;
 1412|  3.19k|  h_predictor_32xh(dst, stride, left, 64);
 1413|  3.19k|}
aom_h_predictor_64x64_sse2:
 1448|  18.6k|                                const uint8_t *above, const uint8_t *left) {
 1449|  18.6k|  (void)above;
 1450|  18.6k|  h_predictor_64xh(dst, stride, left, 64);
 1451|  18.6k|}
aom_h_predictor_64x32_sse2:
 1454|  12.3k|                                const uint8_t *above, const uint8_t *left) {
 1455|  12.3k|  (void)above;
 1456|  12.3k|  h_predictor_64xh(dst, stride, left, 32);
 1457|  12.3k|}
aom_h_predictor_64x16_sse2:
 1461|  59.8k|                                const uint8_t *above, const uint8_t *left) {
 1462|  59.8k|  (void)above;
 1463|  59.8k|  h_predictor_64xh(dst, stride, left, 16);
 1464|  59.8k|}
intrapred_sse2.c:dc_sum_8:
   72|  1.76M|static inline __m128i dc_sum_8(const uint8_t *ref) {
   73|  1.76M|  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
   74|  1.76M|  const __m128i zero = _mm_setzero_si128();
   75|  1.76M|  return _mm_sad_epu8(x, zero);
   76|  1.76M|}
intrapred_sse2.c:dc_sum_4:
   65|  1.51M|static inline __m128i dc_sum_4(const uint8_t *ref) {
   66|  1.51M|  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
   67|  1.51M|  const __m128i zero = _mm_setzero_si128();
   68|  1.51M|  x = _mm_unpacklo_epi8(x, zero);
   69|  1.51M|  return _mm_sad_epu8(x, zero);
   70|  1.51M|}
intrapred_sse2.c:divide_using_multiply_shift:
  101|  2.70M|                                              int multiplier) {
  102|  2.70M|  const int interm = num >> shift1;
  103|  2.70M|  return interm * multiplier >> DC_SHIFT2;
  ------------------
  |  |   98|  2.70M|#define DC_SHIFT2 16
  ------------------
  104|  2.70M|}
intrapred_sse2.c:dc_store_4xh:
   17|   513k|                                ptrdiff_t stride) {
   18|  3.35M|  for (int i = 0; i < height; i += 2) {
  ------------------
  |  Branch (18:19): [True: 2.84M, False: 513k]
  ------------------
   19|  2.84M|    *(uint32_t *)dst = dc;
   20|  2.84M|    dst += stride;
   21|  2.84M|    *(uint32_t *)dst = dc;
   22|  2.84M|    dst += stride;
   23|  2.84M|  }
   24|   513k|}
intrapred_sse2.c:dc_store_8xh:
   27|   828k|                                ptrdiff_t stride) {
   28|   828k|  int i;
   29|  10.3M|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (29:15): [True: 9.52M, False: 828k]
  ------------------
   30|  9.52M|    _mm_storel_epi64((__m128i *)dst, *row);
   31|  9.52M|    dst += stride;
   32|  9.52M|  }
   33|   828k|}
intrapred_sse2.c:dc_store_16xh:
   36|  1.23M|                                 ptrdiff_t stride) {
   37|  1.23M|  int i;
   38|  13.6M|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (38:15): [True: 12.3M, False: 1.23M]
  ------------------
   39|  12.3M|    _mm_store_si128((__m128i *)dst, *row);
   40|  12.3M|    dst += stride;
   41|  12.3M|  }
   42|  1.23M|}
intrapred_sse2.c:dc_sum_64:
   78|  19.0k|static inline __m128i dc_sum_64(const uint8_t *ref) {
   79|  19.0k|  __m128i x0 = _mm_load_si128((__m128i const *)ref);
   80|  19.0k|  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
   81|  19.0k|  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
   82|  19.0k|  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
   83|  19.0k|  const __m128i zero = _mm_setzero_si128();
   84|  19.0k|  x0 = _mm_sad_epu8(x0, zero);
   85|  19.0k|  x1 = _mm_sad_epu8(x1, zero);
   86|  19.0k|  x2 = _mm_sad_epu8(x2, zero);
   87|  19.0k|  x3 = _mm_sad_epu8(x3, zero);
   88|  19.0k|  x0 = _mm_add_epi16(x0, x1);
   89|  19.0k|  x2 = _mm_add_epi16(x2, x3);
   90|  19.0k|  x0 = _mm_add_epi16(x0, x2);
   91|  19.0k|  const __m128i high = _mm_unpackhi_epi64(x0, x0);
   92|  19.0k|  return _mm_add_epi16(x0, high);
   93|  19.0k|}
intrapred_sse2.c:dc_store_32xh:
   45|   386k|                                 ptrdiff_t stride) {
   46|   386k|  int i;
   47|  3.47M|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (47:15): [True: 3.08M, False: 386k]
  ------------------
   48|  3.08M|    _mm_store_si128((__m128i *)dst, *row);
   49|  3.08M|    _mm_store_si128((__m128i *)(dst + 16), *row);
   50|  3.08M|    dst += stride;
   51|  3.08M|  }
   52|   386k|}
intrapred_sse2.c:v_predictor_32xh:
  958|  14.1k|                                    const uint8_t *above, int height) {
  959|  14.1k|  const __m128i row0 = _mm_load_si128((__m128i const *)above);
  960|  14.1k|  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
  961|   127k|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (961:19): [True: 112k, False: 14.1k]
  ------------------
  962|   112k|    _mm_store_si128((__m128i *)dst, row0);
  963|   112k|    _mm_store_si128((__m128i *)(dst + 16), row1);
  964|   112k|    dst += stride;
  965|   112k|  }
  966|  14.1k|}
intrapred_sse2.c:h_predictor_8x16xc:
 1140|  49.7k|                                      int count) {
 1141|  49.7k|  (void)above;
 1142|   109k|  for (int i = 0; i < count; ++i) {
  ------------------
  |  Branch (1142:19): [True: 59.3k, False: 49.7k]
  ------------------
 1143|  59.3k|    const __m128i left_col = _mm_load_si128((__m128i const *)left);
 1144|  59.3k|    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
 1145|  59.3k|    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
 1146|       |
 1147|  59.3k|    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
 1148|  59.3k|    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
 1149|  59.3k|    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
 1150|  59.3k|    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
 1151|  59.3k|    _mm_storel_epi64((__m128i *)dst, row0);
 1152|  59.3k|    dst += stride;
 1153|  59.3k|    _mm_storel_epi64((__m128i *)dst, row1);
 1154|  59.3k|    dst += stride;
 1155|  59.3k|    _mm_storel_epi64((__m128i *)dst, row2);
 1156|  59.3k|    dst += stride;
 1157|  59.3k|    _mm_storel_epi64((__m128i *)dst, row3);
 1158|  59.3k|    dst += stride;
 1159|       |
 1160|  59.3k|    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
 1161|  59.3k|    row0 = _mm_shufflelo_epi16(left_col_low, 0);
 1162|  59.3k|    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
 1163|  59.3k|    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
 1164|  59.3k|    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
 1165|  59.3k|    _mm_storel_epi64((__m128i *)dst, row0);
 1166|  59.3k|    dst += stride;
 1167|  59.3k|    _mm_storel_epi64((__m128i *)dst, row1);
 1168|  59.3k|    dst += stride;
 1169|  59.3k|    _mm_storel_epi64((__m128i *)dst, row2);
 1170|  59.3k|    dst += stride;
 1171|  59.3k|    _mm_storel_epi64((__m128i *)dst, row3);
 1172|  59.3k|    dst += stride;
 1173|       |
 1174|  59.3k|    row0 = _mm_shufflelo_epi16(left_col_high, 0);
 1175|  59.3k|    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
 1176|  59.3k|    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
 1177|  59.3k|    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
 1178|  59.3k|    _mm_storel_epi64((__m128i *)dst, row0);
 1179|  59.3k|    dst += stride;
 1180|  59.3k|    _mm_storel_epi64((__m128i *)dst, row1);
 1181|  59.3k|    dst += stride;
 1182|  59.3k|    _mm_storel_epi64((__m128i *)dst, row2);
 1183|  59.3k|    dst += stride;
 1184|  59.3k|    _mm_storel_epi64((__m128i *)dst, row3);
 1185|  59.3k|    dst += stride;
 1186|       |
 1187|  59.3k|    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
 1188|  59.3k|    row0 = _mm_shufflelo_epi16(left_col_high, 0);
 1189|  59.3k|    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
 1190|  59.3k|    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
 1191|  59.3k|    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
 1192|  59.3k|    _mm_storel_epi64((__m128i *)dst, row0);
 1193|  59.3k|    dst += stride;
 1194|  59.3k|    _mm_storel_epi64((__m128i *)dst, row1);
 1195|  59.3k|    dst += stride;
 1196|  59.3k|    _mm_storel_epi64((__m128i *)dst, row2);
 1197|  59.3k|    dst += stride;
 1198|  59.3k|    _mm_storel_epi64((__m128i *)dst, row3);
 1199|  59.3k|    dst += stride;
 1200|  59.3k|    left += 16;
 1201|  59.3k|  }
 1202|  49.7k|}
intrapred_sse2.c:h_prediction_16x8_1:
 1252|   357k|                                       ptrdiff_t stride) {
 1253|   357k|  __m128i row[4];
 1254|   357k|  repeat_low_4pixels(left, row);
 1255|   357k|  h_pred_store_16xh(row, 4, dst, stride);
 1256|   357k|}
intrapred_sse2.c:repeat_low_4pixels:
 1225|   659k|static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) {
 1226|   659k|  const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
 1227|   659k|  const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
 1228|   659k|  const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
 1229|   659k|  const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
 1230|       |
 1231|   659k|  row[0] = _mm_unpacklo_epi64(u0, u0);
 1232|   659k|  row[1] = _mm_unpacklo_epi64(u1, u1);
 1233|   659k|  row[2] = _mm_unpacklo_epi64(u2, u2);
 1234|   659k|  row[3] = _mm_unpacklo_epi64(u3, u3);
 1235|   659k|}
intrapred_sse2.c:h_pred_store_16xh:
 1217|   561k|                                     ptrdiff_t stride) {
 1218|   561k|  int i;
 1219|  2.80M|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (1219:15): [True: 2.24M, False: 561k]
  ------------------
 1220|  2.24M|    _mm_store_si128((__m128i *)dst, row[i]);
 1221|  2.24M|    dst += stride;
 1222|  2.24M|  }
 1223|   561k|}
intrapred_sse2.c:h_prediction_16x8_2:
 1261|   203k|                                       ptrdiff_t stride) {
 1262|   203k|  __m128i row[4];
 1263|   203k|  repeat_high_4pixels(left, row);
 1264|   203k|  h_pred_store_16xh(row, 4, dst, stride);
 1265|   203k|}
intrapred_sse2.c:repeat_high_4pixels:
 1237|   505k|static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) {
 1238|   505k|  const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
 1239|   505k|  const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
 1240|   505k|  const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
 1241|   505k|  const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
 1242|       |
 1243|   505k|  row[0] = _mm_unpackhi_epi64(u0, u0);
 1244|   505k|  row[1] = _mm_unpackhi_epi64(u1, u1);
 1245|   505k|  row[2] = _mm_unpackhi_epi64(u2, u2);
 1246|   505k|  row[3] = _mm_unpackhi_epi64(u3, u3);
 1247|   505k|}
intrapred_sse2.c:h_predictor_16xh:
 1288|  25.7k|                                    const uint8_t *left, int count) {
 1289|  25.7k|  int i = 0;
 1290|  56.7k|  do {
 1291|  56.7k|    const __m128i left_col = _mm_load_si128((const __m128i *)left);
 1292|  56.7k|    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
 1293|  56.7k|    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
 1294|  56.7k|    dst += stride << 2;
 1295|  56.7k|    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
 1296|  56.7k|    dst += stride << 2;
 1297|       |
 1298|  56.7k|    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
 1299|  56.7k|    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
 1300|  56.7k|    dst += stride << 2;
 1301|  56.7k|    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
 1302|  56.7k|    dst += stride << 2;
 1303|       |
 1304|  56.7k|    left += 16;
 1305|  56.7k|    i++;
 1306|  56.7k|  } while (i < count);
  ------------------
  |  Branch (1306:12): [True: 31.0k, False: 25.7k]
  ------------------
 1307|  25.7k|}
intrapred_sse2.c:h_prediction_32x8_1:
 1336|   301k|                                       ptrdiff_t stride) {
 1337|   301k|  __m128i row[4];
 1338|   301k|  repeat_low_4pixels(left, row);
 1339|   301k|  h_pred_store_32xh(row, 4, dst, stride);
 1340|   301k|}
intrapred_sse2.c:h_pred_store_32xh:
 1324|   602k|                                     ptrdiff_t stride) {
 1325|   602k|  int i;
 1326|  3.01M|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (1326:15): [True: 2.41M, False: 602k]
  ------------------
 1327|  2.41M|    _mm_store_si128((__m128i *)dst, row[i]);
 1328|  2.41M|    _mm_store_si128((__m128i *)(dst + 16), row[i]);
 1329|  2.41M|    dst += stride;
 1330|  2.41M|  }
 1331|   602k|}
intrapred_sse2.c:h_prediction_32x8_2:
 1345|   301k|                                       ptrdiff_t stride) {
 1346|   301k|  __m128i row[4];
 1347|   301k|  repeat_high_4pixels(left, row);
 1348|   301k|  h_pred_store_32xh(row, 4, dst, stride);
 1349|   301k|}
intrapred_sse2.c:h_predictor_32xh:
 1386|  3.19k|                                    const uint8_t *left, int height) {
 1387|  3.19k|  int i = height >> 2;
 1388|  51.1k|  do {
 1389|  51.1k|    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
 1390|  51.1k|    left4 = _mm_unpacklo_epi8(left4, left4);
 1391|  51.1k|    left4 = _mm_unpacklo_epi8(left4, left4);
 1392|  51.1k|    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
 1393|  51.1k|    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
 1394|  51.1k|    _mm_store_si128((__m128i *)dst, r0);
 1395|  51.1k|    _mm_store_si128((__m128i *)(dst + 16), r0);
 1396|  51.1k|    _mm_store_si128((__m128i *)(dst + stride), r1);
 1397|  51.1k|    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
 1398|  51.1k|    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
 1399|  51.1k|    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
 1400|  51.1k|    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
 1401|  51.1k|    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
 1402|  51.1k|    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
 1403|  51.1k|    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
 1404|  51.1k|    left += 4;
 1405|  51.1k|    dst += stride * 4;
 1406|  51.1k|  } while (--i);
  ------------------
  |  Branch (1406:12): [True: 47.9k, False: 3.19k]
  ------------------
 1407|  3.19k|}
intrapred_sse2.c:h_predictor_64xh:
 1416|  90.9k|                                    const uint8_t *left, int height) {
 1417|  90.9k|  int i = height >> 2;
 1418|   636k|  do {
 1419|   636k|    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
 1420|   636k|    left4 = _mm_unpacklo_epi8(left4, left4);
 1421|   636k|    left4 = _mm_unpacklo_epi8(left4, left4);
 1422|   636k|    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
 1423|   636k|    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
 1424|   636k|    _mm_store_si128((__m128i *)dst, r0);
 1425|   636k|    _mm_store_si128((__m128i *)(dst + 16), r0);
 1426|   636k|    _mm_store_si128((__m128i *)(dst + 32), r0);
 1427|   636k|    _mm_store_si128((__m128i *)(dst + 48), r0);
 1428|   636k|    _mm_store_si128((__m128i *)(dst + stride), r1);
 1429|   636k|    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
 1430|   636k|    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
 1431|   636k|    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
 1432|   636k|    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
 1433|   636k|    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
 1434|   636k|    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
 1435|   636k|    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
 1436|   636k|    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
 1437|   636k|    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
 1438|   636k|    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
 1439|   636k|    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
 1440|   636k|    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
 1441|   636k|    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
 1442|   636k|    left += 4;
 1443|   636k|    dst += stride * 4;
 1444|   636k|  } while (--i);
  ------------------
  |  Branch (1444:12): [True: 546k, False: 90.9k]
  ------------------
 1445|  90.9k|}

aom_paeth_predictor_4x4_ssse3:
   45|   268k|                                   const uint8_t *above, const uint8_t *left) {
   46|   268k|  __m128i l = _mm_loadl_epi64((const __m128i *)left);
   47|   268k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   48|   268k|  const __m128i zero = _mm_setzero_si128();
   49|   268k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   50|   268k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   51|   268k|  __m128i rep = _mm_set1_epi16((short)0x8000);
   52|   268k|  const __m128i one = _mm_set1_epi16(1);
   53|       |
   54|   268k|  int i;
   55|  1.34M|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (55:15): [True: 1.07M, False: 268k]
  ------------------
   56|  1.07M|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
   57|  1.07M|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
   58|       |
   59|  1.07M|    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
   60|  1.07M|    dst += stride;
   61|  1.07M|    rep = _mm_add_epi16(rep, one);
   62|  1.07M|  }
   63|   268k|}
aom_paeth_predictor_4x8_ssse3:
   66|  58.3k|                                   const uint8_t *above, const uint8_t *left) {
   67|  58.3k|  __m128i l = _mm_loadl_epi64((const __m128i *)left);
   68|  58.3k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
   69|  58.3k|  const __m128i zero = _mm_setzero_si128();
   70|  58.3k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   71|  58.3k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   72|  58.3k|  __m128i rep = _mm_set1_epi16((short)0x8000);
   73|  58.3k|  const __m128i one = _mm_set1_epi16(1);
   74|       |
   75|  58.3k|  int i;
   76|   525k|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (76:15): [True: 467k, False: 58.3k]
  ------------------
   77|   467k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
   78|   467k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
   79|       |
   80|   467k|    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
   81|   467k|    dst += stride;
   82|   467k|    rep = _mm_add_epi16(rep, one);
   83|   467k|  }
   84|  58.3k|}
aom_paeth_predictor_4x16_ssse3:
   88|  62.1k|                                    const uint8_t *above, const uint8_t *left) {
   89|  62.1k|  __m128i l = _mm_load_si128((const __m128i *)left);
   90|  62.1k|  const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
   91|  62.1k|  const __m128i zero = _mm_setzero_si128();
   92|  62.1k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
   93|  62.1k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
   94|  62.1k|  __m128i rep = _mm_set1_epi16((short)0x8000);
   95|  62.1k|  const __m128i one = _mm_set1_epi16(1);
   96|       |
   97|  1.05M|  for (int i = 0; i < 16; ++i) {
  ------------------
  |  Branch (97:19): [True: 993k, False: 62.1k]
  ------------------
   98|   993k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
   99|   993k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  100|       |
  101|   993k|    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
  102|   993k|    dst += stride;
  103|   993k|    rep = _mm_add_epi16(rep, one);
  104|   993k|  }
  105|  62.1k|}
aom_paeth_predictor_8x4_ssse3:
  109|  85.2k|                                   const uint8_t *above, const uint8_t *left) {
  110|  85.2k|  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  111|  85.2k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  112|  85.2k|  const __m128i zero = _mm_setzero_si128();
  113|  85.2k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  114|  85.2k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  115|  85.2k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  116|  85.2k|  const __m128i one = _mm_set1_epi16(1);
  117|       |
  118|  85.2k|  int i;
  119|   426k|  for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (119:15): [True: 341k, False: 85.2k]
  ------------------
  120|   341k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
  121|   341k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  122|       |
  123|   341k|    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
  124|   341k|    dst += stride;
  125|   341k|    rep = _mm_add_epi16(rep, one);
  126|   341k|  }
  127|  85.2k|}
aom_paeth_predictor_8x8_ssse3:
  130|   176k|                                   const uint8_t *above, const uint8_t *left) {
  131|   176k|  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  132|   176k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  133|   176k|  const __m128i zero = _mm_setzero_si128();
  134|   176k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  135|   176k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  136|   176k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  137|   176k|  const __m128i one = _mm_set1_epi16(1);
  138|       |
  139|   176k|  int i;
  140|  1.58M|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (140:15): [True: 1.41M, False: 176k]
  ------------------
  141|  1.41M|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
  142|  1.41M|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  143|       |
  144|  1.41M|    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
  145|  1.41M|    dst += stride;
  146|  1.41M|    rep = _mm_add_epi16(rep, one);
  147|  1.41M|  }
  148|   176k|}
aom_paeth_predictor_8x16_ssse3:
  151|  47.2k|                                    const uint8_t *above, const uint8_t *left) {
  152|  47.2k|  __m128i l = _mm_load_si128((const __m128i *)left);
  153|  47.2k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  154|  47.2k|  const __m128i zero = _mm_setzero_si128();
  155|  47.2k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  156|  47.2k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  157|  47.2k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  158|  47.2k|  const __m128i one = _mm_set1_epi16(1);
  159|       |
  160|  47.2k|  int i;
  161|   802k|  for (i = 0; i < 16; ++i) {
  ------------------
  |  Branch (161:15): [True: 755k, False: 47.2k]
  ------------------
  162|   755k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
  163|   755k|    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  164|       |
  165|   755k|    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
  166|   755k|    dst += stride;
  167|   755k|    rep = _mm_add_epi16(rep, one);
  168|   755k|  }
  169|  47.2k|}
aom_paeth_predictor_8x32_ssse3:
  173|  31.2k|                                    const uint8_t *above, const uint8_t *left) {
  174|  31.2k|  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  175|  31.2k|  const __m128i zero = _mm_setzero_si128();
  176|  31.2k|  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  177|  31.2k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  178|  31.2k|  const __m128i one = _mm_set1_epi16(1);
  179|       |
  180|  93.8k|  for (int j = 0; j < 2; ++j) {
  ------------------
  |  Branch (180:19): [True: 62.5k, False: 31.2k]
  ------------------
  181|  62.5k|    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
  182|  62.5k|    __m128i rep = _mm_set1_epi16((short)0x8000);
  183|  1.06M|    for (int i = 0; i < 16; ++i) {
  ------------------
  |  Branch (183:21): [True: 1.00M, False: 62.5k]
  ------------------
  184|  1.00M|      const __m128i l16 = _mm_shuffle_epi8(l, rep);
  185|  1.00M|      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
  186|       |
  187|  1.00M|      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
  188|  1.00M|      dst += stride;
  189|  1.00M|      rep = _mm_add_epi16(rep, one);
  190|  1.00M|    }
  191|  62.5k|  }
  192|  31.2k|}
aom_paeth_predictor_16x4_ssse3:
  206|  67.5k|                                    const uint8_t *above, const uint8_t *left) {
  207|  67.5k|  __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
  208|  67.5k|  const __m128i t = _mm_load_si128((const __m128i *)above);
  209|  67.5k|  const __m128i zero = _mm_setzero_si128();
  210|  67.5k|  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  211|  67.5k|  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  212|  67.5k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  213|  67.5k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  214|  67.5k|  const __m128i one = _mm_set1_epi16(1);
  215|       |
  216|   337k|  for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (216:19): [True: 270k, False: 67.5k]
  ------------------
  217|   270k|    const __m128i l16 = _mm_shuffle_epi8(l, rep);
  218|   270k|    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
  219|       |
  220|   270k|    _mm_store_si128((__m128i *)dst, row);
  221|   270k|    dst += stride;
  222|   270k|    rep = _mm_add_epi16(rep, one);
  223|   270k|  }
  224|  67.5k|}
aom_paeth_predictor_32x8_ssse3:
  332|  29.6k|                                    const uint8_t *above, const uint8_t *left) {
  333|  29.6k|  const __m128i a = _mm_load_si128((const __m128i *)above);
  334|  29.6k|  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  335|  29.6k|  const __m128i zero = _mm_setzero_si128();
  336|  29.6k|  const __m128i al = _mm_unpacklo_epi8(a, zero);
  337|  29.6k|  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  338|  29.6k|  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  339|  29.6k|  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  340|       |
  341|  29.6k|  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  342|  29.6k|  __m128i rep = _mm_set1_epi16((short)0x8000);
  343|  29.6k|  const __m128i one = _mm_set1_epi16(1);
  344|  29.6k|  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  345|  29.6k|  __m128i l16;
  346|       |
  347|   267k|  for (int i = 0; i < 8; ++i) {
  ------------------
  |  Branch (347:19): [True: 237k, False: 29.6k]
  ------------------
  348|   237k|    l16 = _mm_shuffle_epi8(l, rep);
  349|   237k|    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
  350|   237k|    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
  351|       |
  352|   237k|    _mm_store_si128((__m128i *)dst, r32l);
  353|   237k|    _mm_store_si128((__m128i *)(dst + 16), r32h);
  354|   237k|    dst += stride;
  355|   237k|    rep = _mm_add_epi16(rep, one);
  356|   237k|  }
  357|  29.6k|}
aom_smooth_predictor_4x4_ssse3:
  678|   201k|                                    const uint8_t *above, const uint8_t *left) {
  679|   201k|  __m128i pixels[3];
  680|   201k|  load_pixel_w4(above, left, 4, pixels);
  681|       |
  682|   201k|  __m128i wh[4], ww[2];
  683|   201k|  load_weight_w4(4, wh, ww);
  684|       |
  685|   201k|  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
  686|   201k|}
aom_smooth_predictor_4x8_ssse3:
  689|  62.6k|                                    const uint8_t *above, const uint8_t *left) {
  690|  62.6k|  __m128i pixels[3];
  691|  62.6k|  load_pixel_w4(above, left, 8, pixels);
  692|       |
  693|  62.6k|  __m128i wh[4], ww[2];
  694|  62.6k|  load_weight_w4(8, wh, ww);
  695|       |
  696|  62.6k|  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  697|  62.6k|}
aom_smooth_predictor_4x16_ssse3:
  702|  32.4k|                                     const uint8_t *left) {
  703|  32.4k|  __m128i pixels[3];
  704|  32.4k|  load_pixel_w4(above, left, 16, pixels);
  705|       |
  706|  32.4k|  __m128i wh[4], ww[2];
  707|  32.4k|  load_weight_w4(16, wh, ww);
  708|       |
  709|  32.4k|  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  710|  32.4k|  dst += stride << 3;
  711|  32.4k|  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
  712|  32.4k|}
aom_smooth_predictor_8x4_ssse3:
  845|  91.3k|                                    const uint8_t *above, const uint8_t *left) {
  846|  91.3k|  __m128i pixels[4];
  847|  91.3k|  load_pixel_w8(above, left, 4, pixels);
  848|       |
  849|  91.3k|  __m128i wh[4], ww[2];
  850|  91.3k|  load_weight_w8(4, wh, ww);
  851|       |
  852|  91.3k|  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
  853|  91.3k|}
aom_smooth_predictor_8x8_ssse3:
  856|   186k|                                    const uint8_t *above, const uint8_t *left) {
  857|   186k|  __m128i pixels[4];
  858|   186k|  load_pixel_w8(above, left, 8, pixels);
  859|       |
  860|   186k|  __m128i wh[4], ww[2];
  861|   186k|  load_weight_w8(8, wh, ww);
  862|       |
  863|   186k|  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  864|   186k|}
aom_smooth_predictor_8x16_ssse3:
  868|  50.7k|                                     const uint8_t *left) {
  869|  50.7k|  __m128i pixels[4];
  870|  50.7k|  load_pixel_w8(above, left, 16, pixels);
  871|       |
  872|  50.7k|  __m128i wh[4], ww[2];
  873|  50.7k|  load_weight_w8(16, wh, ww);
  874|       |
  875|  50.7k|  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  876|  50.7k|  dst += stride << 3;
  877|  50.7k|  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
  878|  50.7k|}
aom_smooth_predictor_8x32_ssse3:
  883|  18.6k|                                     const uint8_t *left) {
  884|  18.6k|  __m128i pixels[8];
  885|  18.6k|  load_pixel_w8(above, left, 32, pixels);
  886|       |
  887|  18.6k|  __m128i wh[8], ww[2];
  888|  18.6k|  load_weight_w8(32, wh, ww);
  889|       |
  890|  18.6k|  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
  891|  18.6k|  dst += stride << 3;
  892|  18.6k|  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
  893|  18.6k|  dst += stride << 3;
  894|  18.6k|  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
  895|  18.6k|  dst += stride << 3;
  896|  18.6k|  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
  897|  18.6k|}
aom_smooth_predictor_16x4_ssse3:
 1018|  62.4k|                                     const uint8_t *left) {
 1019|  62.4k|  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
 1020|  62.4k|}
aom_smooth_predictor_16x8_ssse3:
 1025|  66.8k|                                     const uint8_t *left) {
 1026|  66.8k|  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
 1027|  66.8k|}
aom_smooth_predictor_16x16_ssse3:
 1031|   110k|                                      const uint8_t *left) {
 1032|   110k|  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
 1033|   110k|}
aom_smooth_predictor_16x32_ssse3:
 1037|  28.1k|                                      const uint8_t *left) {
 1038|  28.1k|  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
 1039|  28.1k|}
aom_smooth_predictor_16x64_ssse3:
 1044|  5.19k|                                      const uint8_t *left) {
 1045|  5.19k|  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
 1046|  5.19k|}
aom_smooth_predictor_32x8_ssse3:
 1050|  43.9k|                                     const uint8_t *left) {
 1051|  43.9k|  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
 1052|  43.9k|}
aom_smooth_predictor_32x16_ssse3:
 1057|  28.1k|                                      const uint8_t *left) {
 1058|  28.1k|  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
 1059|  28.1k|}
aom_smooth_predictor_32x32_ssse3:
 1063|  97.1k|                                      const uint8_t *left) {
 1064|  97.1k|  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
 1065|  97.1k|}
aom_smooth_predictor_32x64_ssse3:
 1069|  2.88k|                                      const uint8_t *left) {
 1070|  2.88k|  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
 1071|  2.88k|}
aom_smooth_predictor_64x16_ssse3:
 1076|  25.1k|                                      const uint8_t *left) {
 1077|  25.1k|  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
 1078|  25.1k|}
aom_smooth_predictor_64x32_ssse3:
 1083|  5.39k|                                      const uint8_t *left) {
 1084|  5.39k|  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
 1085|  5.39k|}
aom_smooth_predictor_64x64_ssse3:
 1089|  29.7k|                                      const uint8_t *left) {
 1090|  29.7k|  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
 1091|  29.7k|}
aom_smooth_v_predictor_4x4_ssse3:
 1198|  39.3k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1199|  39.3k|  __m128i pixels;
 1200|  39.3k|  load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
 1201|       |
 1202|  39.3k|  __m128i weights[2];
 1203|  39.3k|  load_smooth_vertical_weights4(smooth_weights, 4, weights);
 1204|       |
 1205|  39.3k|  write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
 1206|  39.3k|}
aom_smooth_v_predictor_4x8_ssse3:
 1211|  13.4k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1212|  13.4k|  __m128i pixels;
 1213|  13.4k|  load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
 1214|       |
 1215|  13.4k|  __m128i weights[2];
 1216|  13.4k|  load_smooth_vertical_weights4(smooth_weights, 8, weights);
 1217|       |
 1218|  13.4k|  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
 1219|  13.4k|}
aom_smooth_v_predictor_4x16_ssse3:
 1225|  9.59k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1226|  9.59k|  __m128i pixels;
 1227|  9.59k|  load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
 1228|       |
 1229|  9.59k|  __m128i weights[4];
 1230|  9.59k|  load_smooth_vertical_weights4(smooth_weights, 16, weights);
 1231|       |
 1232|  9.59k|  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
 1233|  9.59k|  dst += stride << 3;
 1234|  9.59k|  write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
 1235|  9.59k|}
aom_smooth_v_predictor_8x4_ssse3:
 1241|  19.6k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1242|  19.6k|  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
 1243|  19.6k|  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
 1244|  19.6k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  19.6k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1245|  19.6k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1246|  19.6k|  const __m128i scaled_bottom_left =
 1247|  19.6k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1248|  19.6k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  19.6k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1249|  19.6k|  __m128i y_select = _mm_set1_epi32(0x01000100);
 1250|  19.6k|  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
 1251|  19.6k|  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1252|  19.6k|  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1253|  19.6k|  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1254|  19.6k|                                &round);
 1255|  19.6k|  dst += stride;
 1256|  19.6k|  y_select = _mm_set1_epi32(0x03020302);
 1257|  19.6k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1258|  19.6k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1259|  19.6k|  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1260|  19.6k|                                &round);
 1261|  19.6k|  dst += stride;
 1262|  19.6k|  y_select = _mm_set1_epi32(0x05040504);
 1263|  19.6k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1264|  19.6k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1265|  19.6k|  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1266|  19.6k|                                &round);
 1267|  19.6k|  dst += stride;
 1268|  19.6k|  y_select = _mm_set1_epi32(0x07060706);
 1269|  19.6k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1270|  19.6k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1271|  19.6k|  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1272|  19.6k|                                &round);
 1273|  19.6k|}
aom_smooth_v_predictor_8x8_ssse3:
 1278|  42.5k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1279|  42.5k|  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
 1280|  42.5k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 1281|  42.5k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  42.5k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1282|  42.5k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1283|  42.5k|  const __m128i scaled_bottom_left =
 1284|  42.5k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1285|  42.5k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  42.5k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1286|  42.5k|  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
 1287|   383k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1287:33): [True: 340k, False: 42.5k]
  ------------------
 1288|   340k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1289|   340k|    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1290|   340k|    const __m128i scaled_bottom_left_y =
 1291|   340k|        _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1292|   340k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1293|   340k|                                  &round);
 1294|   340k|    dst += stride;
 1295|   340k|  }
 1296|  42.5k|}
aom_smooth_v_predictor_8x16_ssse3:
 1301|  13.2k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1302|  13.2k|  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
 1303|  13.2k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 1304|       |
 1305|  13.2k|  const __m128i weights1 = cvtepu8_epi16(weights);
 1306|  13.2k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 1307|  13.2k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  13.2k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1308|  13.2k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1309|  13.2k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1310|  13.2k|  const __m128i scaled_bottom_left1 =
 1311|  13.2k|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1312|  13.2k|  const __m128i scaled_bottom_left2 =
 1313|  13.2k|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1314|  13.2k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  13.2k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1315|  13.2k|  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
 1316|   119k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1316:33): [True: 106k, False: 13.2k]
  ------------------
 1317|   106k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1318|   106k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1319|   106k|    const __m128i scaled_bottom_left_y =
 1320|   106k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1321|   106k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1322|   106k|                                  &round);
 1323|   106k|    dst += stride;
 1324|   106k|  }
 1325|   119k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1325:33): [True: 106k, False: 13.2k]
  ------------------
 1326|   106k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1327|   106k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1328|   106k|    const __m128i scaled_bottom_left_y =
 1329|   106k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1330|   106k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1331|   106k|                                  &round);
 1332|   106k|    dst += stride;
 1333|   106k|  }
 1334|  13.2k|}
aom_smooth_v_predictor_8x32_ssse3:
 1340|  4.73k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1341|  4.73k|  const __m128i zero = _mm_setzero_si128();
 1342|  4.73k|  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
 1343|  4.73k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 1344|  4.73k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 1345|  4.73k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 1346|  4.73k|  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
 1347|  4.73k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 1348|  4.73k|  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
 1349|  4.73k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  4.73k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1350|  4.73k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1351|  4.73k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1352|  4.73k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 1353|  4.73k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 1354|  4.73k|  const __m128i scaled_bottom_left1 =
 1355|  4.73k|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1356|  4.73k|  const __m128i scaled_bottom_left2 =
 1357|  4.73k|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1358|  4.73k|  const __m128i scaled_bottom_left3 =
 1359|  4.73k|      _mm_mullo_epi16(inverted_weights3, bottom_left);
 1360|  4.73k|  const __m128i scaled_bottom_left4 =
 1361|  4.73k|      _mm_mullo_epi16(inverted_weights4, bottom_left);
 1362|  4.73k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  4.73k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1363|  4.73k|  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
 1364|  42.6k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1364:33): [True: 37.8k, False: 4.73k]
  ------------------
 1365|  37.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1366|  37.8k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1367|  37.8k|    const __m128i scaled_bottom_left_y =
 1368|  37.8k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1369|  37.8k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1370|  37.8k|                                  &round);
 1371|  37.8k|    dst += stride;
 1372|  37.8k|  }
 1373|  42.6k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1373:33): [True: 37.8k, False: 4.73k]
  ------------------
 1374|  37.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1375|  37.8k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1376|  37.8k|    const __m128i scaled_bottom_left_y =
 1377|  37.8k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1378|  37.8k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1379|  37.8k|                                  &round);
 1380|  37.8k|    dst += stride;
 1381|  37.8k|  }
 1382|  42.6k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1382:33): [True: 37.8k, False: 4.73k]
  ------------------
 1383|  37.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1384|  37.8k|    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
 1385|  37.8k|    const __m128i scaled_bottom_left_y =
 1386|  37.8k|        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
 1387|  37.8k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1388|  37.8k|                                  &round);
 1389|  37.8k|    dst += stride;
 1390|  37.8k|  }
 1391|  42.6k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1391:33): [True: 37.8k, False: 4.73k]
  ------------------
 1392|  37.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1393|  37.8k|    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
 1394|  37.8k|    const __m128i scaled_bottom_left_y =
 1395|  37.8k|        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
 1396|  37.8k|    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
 1397|  37.8k|                                  &round);
 1398|  37.8k|    dst += stride;
 1399|  37.8k|  }
 1400|  4.73k|}
aom_smooth_v_predictor_16x4_ssse3:
 1405|  18.3k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1406|  18.3k|  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
 1407|  18.3k|  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
 1408|  18.3k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  18.3k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1409|  18.3k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1410|  18.3k|  const __m128i scaled_bottom_left =
 1411|  18.3k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1412|  18.3k|  const __m128i round = _mm_set1_epi16(128);
 1413|  18.3k|  const __m128i top = LoadUnaligned16(top_row);
 1414|  18.3k|  const __m128i top_lo = cvtepu8_epi16(top);
 1415|  18.3k|  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
 1416|       |
 1417|  18.3k|  __m128i y_select = _mm_set1_epi32(0x01000100);
 1418|  18.3k|  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1419|  18.3k|  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1420|  18.3k|  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1421|  18.3k|                                 scaled_bottom_left_y, scaled_bottom_left_y,
 1422|  18.3k|                                 round);
 1423|  18.3k|  dst += stride;
 1424|  18.3k|  y_select = _mm_set1_epi32(0x03020302);
 1425|  18.3k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1426|  18.3k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1427|  18.3k|  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1428|  18.3k|                                 scaled_bottom_left_y, scaled_bottom_left_y,
 1429|  18.3k|                                 round);
 1430|  18.3k|  dst += stride;
 1431|  18.3k|  y_select = _mm_set1_epi32(0x05040504);
 1432|  18.3k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1433|  18.3k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1434|  18.3k|  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1435|  18.3k|                                 scaled_bottom_left_y, scaled_bottom_left_y,
 1436|  18.3k|                                 round);
 1437|  18.3k|  dst += stride;
 1438|  18.3k|  y_select = _mm_set1_epi32(0x07060706);
 1439|  18.3k|  weights_y = _mm_shuffle_epi8(weights, y_select);
 1440|  18.3k|  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1441|  18.3k|  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1442|  18.3k|                                 scaled_bottom_left_y, scaled_bottom_left_y,
 1443|  18.3k|                                 round);
 1444|  18.3k|}
aom_smooth_v_predictor_16x8_ssse3:
 1450|  14.0k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1451|  14.0k|  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
 1452|  14.0k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 1453|  14.0k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  14.0k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1454|  14.0k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1455|  14.0k|  const __m128i scaled_bottom_left =
 1456|  14.0k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1457|  14.0k|  const __m128i round = _mm_set1_epi16(128);
 1458|  14.0k|  const __m128i top = LoadUnaligned16(top_row);
 1459|  14.0k|  const __m128i top_lo = cvtepu8_epi16(top);
 1460|  14.0k|  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
 1461|   126k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1461:33): [True: 112k, False: 14.0k]
  ------------------
 1462|   112k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1463|   112k|    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1464|   112k|    const __m128i scaled_bottom_left_y =
 1465|   112k|        _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1466|   112k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1467|   112k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1468|   112k|                                   round);
 1469|   112k|    dst += stride;
 1470|   112k|  }
 1471|  14.0k|}
aom_smooth_v_predictor_16x16_ssse3:
 1476|  33.2k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1477|  33.2k|  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
 1478|  33.2k|  const __m128i zero = _mm_setzero_si128();
 1479|  33.2k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  33.2k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1480|  33.2k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 1481|  33.2k|  const __m128i weights_lo = cvtepu8_epi16(weights);
 1482|  33.2k|  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
 1483|  33.2k|  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
 1484|  33.2k|  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
 1485|  33.2k|  const __m128i scaled_bottom_left_lo =
 1486|  33.2k|      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
 1487|  33.2k|  const __m128i scaled_bottom_left_hi =
 1488|  33.2k|      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
 1489|  33.2k|  const __m128i round = _mm_set1_epi16(128);
 1490|       |
 1491|  33.2k|  const __m128i top = LoadUnaligned16(top_row);
 1492|  33.2k|  const __m128i top_lo = cvtepu8_epi16(top);
 1493|  33.2k|  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
 1494|   299k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1494:33): [True: 266k, False: 33.2k]
  ------------------
 1495|   266k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1496|   266k|    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
 1497|   266k|    const __m128i scaled_bottom_left_y =
 1498|   266k|        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
 1499|   266k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1500|   266k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1501|   266k|                                   round);
 1502|   266k|    dst += stride;
 1503|   266k|  }
 1504|   299k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1504:33): [True: 266k, False: 33.2k]
  ------------------
 1505|   266k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1506|   266k|    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
 1507|   266k|    const __m128i scaled_bottom_left_y =
 1508|   266k|        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
 1509|   266k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1510|   266k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1511|   266k|                                   round);
 1512|   266k|    dst += stride;
 1513|   266k|  }
 1514|  33.2k|}
aom_smooth_v_predictor_16x32_ssse3:
 1519|  9.09k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1520|  9.09k|  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
 1521|  9.09k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 1522|  9.09k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 1523|  9.09k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  9.09k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1524|  9.09k|  const __m128i zero = _mm_setzero_si128();
 1525|  9.09k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 1526|  9.09k|  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
 1527|  9.09k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 1528|  9.09k|  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
 1529|  9.09k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1530|  9.09k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1531|  9.09k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 1532|  9.09k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 1533|  9.09k|  const __m128i scaled_bottom_left1 =
 1534|  9.09k|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1535|  9.09k|  const __m128i scaled_bottom_left2 =
 1536|  9.09k|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1537|  9.09k|  const __m128i scaled_bottom_left3 =
 1538|  9.09k|      _mm_mullo_epi16(inverted_weights3, bottom_left);
 1539|  9.09k|  const __m128i scaled_bottom_left4 =
 1540|  9.09k|      _mm_mullo_epi16(inverted_weights4, bottom_left);
 1541|  9.09k|  const __m128i round = _mm_set1_epi16(128);
 1542|       |
 1543|  9.09k|  const __m128i top = LoadUnaligned16(top_row);
 1544|  9.09k|  const __m128i top_lo = cvtepu8_epi16(top);
 1545|  9.09k|  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
 1546|  81.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1546:33): [True: 72.7k, False: 9.09k]
  ------------------
 1547|  72.7k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1548|  72.7k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1549|  72.7k|    const __m128i scaled_bottom_left_y =
 1550|  72.7k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1551|  72.7k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1552|  72.7k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1553|  72.7k|                                   round);
 1554|  72.7k|    dst += stride;
 1555|  72.7k|  }
 1556|  81.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1556:33): [True: 72.7k, False: 9.09k]
  ------------------
 1557|  72.7k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1558|  72.7k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1559|  72.7k|    const __m128i scaled_bottom_left_y =
 1560|  72.7k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1561|  72.7k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1562|  72.7k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1563|  72.7k|                                   round);
 1564|  72.7k|    dst += stride;
 1565|  72.7k|  }
 1566|  81.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1566:33): [True: 72.7k, False: 9.09k]
  ------------------
 1567|  72.7k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1568|  72.7k|    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
 1569|  72.7k|    const __m128i scaled_bottom_left_y =
 1570|  72.7k|        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
 1571|  72.7k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1572|  72.7k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1573|  72.7k|                                   round);
 1574|  72.7k|    dst += stride;
 1575|  72.7k|  }
 1576|  81.8k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1576:33): [True: 72.7k, False: 9.09k]
  ------------------
 1577|  72.7k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1578|  72.7k|    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
 1579|  72.7k|    const __m128i scaled_bottom_left_y =
 1580|  72.7k|        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
 1581|  72.7k|    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1582|  72.7k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1583|  72.7k|                                   round);
 1584|  72.7k|    dst += stride;
 1585|  72.7k|  }
 1586|  9.09k|}
aom_smooth_v_predictor_16x64_ssse3:
 1592|  1.77k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1593|  1.77k|  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
 1594|  1.77k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  1.77k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1595|  1.77k|  const __m128i round = _mm_set1_epi16(128);
 1596|  1.77k|  const __m128i zero = _mm_setzero_si128();
 1597|  1.77k|  const __m128i top = LoadUnaligned16(top_row);
 1598|  1.77k|  const __m128i top_lo = cvtepu8_epi16(top);
 1599|  1.77k|  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
 1600|  1.77k|  const uint8_t *weights_base_ptr = smooth_weights + 60;
 1601|  8.86k|  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
  ------------------
  |  Branch (1601:29): [True: 7.09k, False: 1.77k]
  ------------------
 1602|  7.09k|    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
 1603|  7.09k|    const __m128i weights_lo = cvtepu8_epi16(weights);
 1604|  7.09k|    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
 1605|  7.09k|    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
 1606|  7.09k|    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
 1607|  7.09k|    const __m128i scaled_bottom_left_lo =
 1608|  7.09k|        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
 1609|  7.09k|    const __m128i scaled_bottom_left_hi =
 1610|  7.09k|        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
 1611|       |
 1612|  63.8k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1612:35): [True: 56.7k, False: 7.09k]
  ------------------
 1613|  56.7k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 1614|  56.7k|      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
 1615|  56.7k|      const __m128i scaled_bottom_left_y =
 1616|  56.7k|          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
 1617|  56.7k|      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1618|  56.7k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1619|  56.7k|                                     round);
 1620|  56.7k|      dst += stride;
 1621|  56.7k|    }
 1622|  63.8k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1622:35): [True: 56.7k, False: 7.09k]
  ------------------
 1623|  56.7k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 1624|  56.7k|      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
 1625|  56.7k|      const __m128i scaled_bottom_left_y =
 1626|  56.7k|          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
 1627|  56.7k|      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
 1628|  56.7k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1629|  56.7k|                                     round);
 1630|  56.7k|      dst += stride;
 1631|  56.7k|    }
 1632|  7.09k|  }
 1633|  1.77k|}
aom_smooth_v_predictor_32x8_ssse3:
 1638|  18.8k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1639|  18.8k|  const __m128i zero = _mm_setzero_si128();
 1640|  18.8k|  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
 1641|  18.8k|  const __m128i top_lo = LoadUnaligned16(top_row);
 1642|  18.8k|  const __m128i top_hi = LoadUnaligned16(top_row + 16);
 1643|  18.8k|  const __m128i top1 = cvtepu8_epi16(top_lo);
 1644|  18.8k|  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
 1645|  18.8k|  const __m128i top3 = cvtepu8_epi16(top_hi);
 1646|  18.8k|  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
 1647|  18.8k|  __m128i scale = _mm_set1_epi16(256);
 1648|  18.8k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 1649|  18.8k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 1650|  18.8k|  const __m128i scaled_bottom_left =
 1651|  18.8k|      _mm_mullo_epi16(inverted_weights, bottom_left);
 1652|  18.8k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  18.8k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1653|   169k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1653:33): [True: 150k, False: 18.8k]
  ------------------
 1654|   150k|    __m128i y_select = _mm_set1_epi32(y_mask);
 1655|   150k|    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
 1656|   150k|    const __m128i scaled_bottom_left_y =
 1657|   150k|        _mm_shuffle_epi8(scaled_bottom_left, y_select);
 1658|   150k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1659|   150k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1660|   150k|                                   round);
 1661|   150k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1662|   150k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1663|   150k|                                   round);
 1664|   150k|    dst += stride;
 1665|   150k|  }
 1666|  18.8k|}
aom_smooth_v_predictor_32x16_ssse3:
 1672|  9.13k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1673|  9.13k|  const __m128i zero = _mm_setzero_si128();
 1674|  9.13k|  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
 1675|  9.13k|  const __m128i top_lo = LoadUnaligned16(top_row);
 1676|  9.13k|  const __m128i top_hi = LoadUnaligned16(top_row + 16);
 1677|  9.13k|  const __m128i top1 = cvtepu8_epi16(top_lo);
 1678|  9.13k|  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
 1679|  9.13k|  const __m128i top3 = cvtepu8_epi16(top_hi);
 1680|  9.13k|  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
 1681|  9.13k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 1682|  9.13k|  const __m128i weights1 = cvtepu8_epi16(weights);
 1683|  9.13k|  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
 1684|  9.13k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  9.13k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1685|  9.13k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1686|  9.13k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1687|  9.13k|  const __m128i scaled_bottom_left1 =
 1688|  9.13k|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1689|  9.13k|  const __m128i scaled_bottom_left2 =
 1690|  9.13k|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1691|  9.13k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  9.13k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1692|  82.1k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1692:33): [True: 73.0k, False: 9.13k]
  ------------------
 1693|  73.0k|    __m128i y_select = _mm_set1_epi32(y_mask);
 1694|  73.0k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1695|  73.0k|    const __m128i scaled_bottom_left_y =
 1696|  73.0k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1697|  73.0k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1698|  73.0k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1699|  73.0k|                                   round);
 1700|  73.0k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1701|  73.0k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1702|  73.0k|                                   round);
 1703|  73.0k|    dst += stride;
 1704|  73.0k|  }
 1705|  82.1k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1705:33): [True: 73.0k, False: 9.13k]
  ------------------
 1706|  73.0k|    __m128i y_select = _mm_set1_epi32(y_mask);
 1707|  73.0k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1708|  73.0k|    const __m128i scaled_bottom_left_y =
 1709|  73.0k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1710|  73.0k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1711|  73.0k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1712|  73.0k|                                   round);
 1713|  73.0k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1714|  73.0k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1715|  73.0k|                                   round);
 1716|  73.0k|    dst += stride;
 1717|  73.0k|  }
 1718|  9.13k|}
aom_smooth_v_predictor_32x32_ssse3:
 1723|  39.3k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1724|  39.3k|  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
 1725|  39.3k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 1726|  39.3k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 1727|  39.3k|  const __m128i zero = _mm_setzero_si128();
 1728|  39.3k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  39.3k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1729|  39.3k|  const __m128i top_lo = LoadUnaligned16(top_row);
 1730|  39.3k|  const __m128i top_hi = LoadUnaligned16(top_row + 16);
 1731|  39.3k|  const __m128i top1 = cvtepu8_epi16(top_lo);
 1732|  39.3k|  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
 1733|  39.3k|  const __m128i top3 = cvtepu8_epi16(top_hi);
 1734|  39.3k|  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
 1735|  39.3k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 1736|  39.3k|  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
 1737|  39.3k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 1738|  39.3k|  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
 1739|  39.3k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1740|  39.3k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1741|  39.3k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 1742|  39.3k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 1743|  39.3k|  const __m128i scaled_bottom_left1 =
 1744|  39.3k|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1745|  39.3k|  const __m128i scaled_bottom_left2 =
 1746|  39.3k|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1747|  39.3k|  const __m128i scaled_bottom_left3 =
 1748|  39.3k|      _mm_mullo_epi16(inverted_weights3, bottom_left);
 1749|  39.3k|  const __m128i scaled_bottom_left4 =
 1750|  39.3k|      _mm_mullo_epi16(inverted_weights4, bottom_left);
 1751|  39.3k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  39.3k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1752|   353k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1752:33): [True: 314k, False: 39.3k]
  ------------------
 1753|   314k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1754|   314k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1755|   314k|    const __m128i scaled_bottom_left_y =
 1756|   314k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1757|   314k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1758|   314k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1759|   314k|                                   round);
 1760|   314k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1761|   314k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1762|   314k|                                   round);
 1763|   314k|    dst += stride;
 1764|   314k|  }
 1765|   353k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1765:33): [True: 314k, False: 39.3k]
  ------------------
 1766|   314k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1767|   314k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1768|   314k|    const __m128i scaled_bottom_left_y =
 1769|   314k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1770|   314k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1771|   314k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1772|   314k|                                   round);
 1773|   314k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1774|   314k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1775|   314k|                                   round);
 1776|   314k|    dst += stride;
 1777|   314k|  }
 1778|   353k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1778:33): [True: 314k, False: 39.3k]
  ------------------
 1779|   314k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1780|   314k|    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
 1781|   314k|    const __m128i scaled_bottom_left_y =
 1782|   314k|        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
 1783|   314k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1784|   314k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1785|   314k|                                   round);
 1786|   314k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1787|   314k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1788|   314k|                                   round);
 1789|   314k|    dst += stride;
 1790|   314k|  }
 1791|   353k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1791:33): [True: 314k, False: 39.3k]
  ------------------
 1792|   314k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1793|   314k|    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
 1794|   314k|    const __m128i scaled_bottom_left_y =
 1795|   314k|        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
 1796|   314k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1797|   314k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1798|   314k|                                   round);
 1799|   314k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1800|   314k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1801|   314k|                                   round);
 1802|   314k|    dst += stride;
 1803|   314k|  }
 1804|  39.3k|}
aom_smooth_v_predictor_32x64_ssse3:
 1809|    741|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1810|    741|  const __m128i zero = _mm_setzero_si128();
 1811|    741|  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
 1812|    741|  const __m128i top_lo = LoadUnaligned16(top_row);
 1813|    741|  const __m128i top_hi = LoadUnaligned16(top_row + 16);
 1814|    741|  const __m128i top1 = cvtepu8_epi16(top_lo);
 1815|    741|  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
 1816|    741|  const __m128i top3 = cvtepu8_epi16(top_hi);
 1817|    741|  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
 1818|    741|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    741|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1819|    741|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|    741|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1820|    741|  const uint8_t *weights_base_ptr = smooth_weights + 60;
 1821|  3.70k|  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
  ------------------
  |  Branch (1821:29): [True: 2.96k, False: 741]
  ------------------
 1822|  2.96k|    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
 1823|  2.96k|    const __m128i weights_lo = cvtepu8_epi16(weights);
 1824|  2.96k|    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
 1825|  2.96k|    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
 1826|  2.96k|    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
 1827|  2.96k|    const __m128i scaled_bottom_left_lo =
 1828|  2.96k|        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
 1829|  2.96k|    const __m128i scaled_bottom_left_hi =
 1830|  2.96k|        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
 1831|       |
 1832|  26.6k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1832:35): [True: 23.7k, False: 2.96k]
  ------------------
 1833|  23.7k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 1834|  23.7k|      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
 1835|  23.7k|      const __m128i scaled_bottom_left_y =
 1836|  23.7k|          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
 1837|  23.7k|      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1838|  23.7k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1839|  23.7k|                                     round);
 1840|  23.7k|      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1841|  23.7k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1842|  23.7k|                                     round);
 1843|  23.7k|      dst += stride;
 1844|  23.7k|    }
 1845|  26.6k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1845:35): [True: 23.7k, False: 2.96k]
  ------------------
 1846|  23.7k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 1847|  23.7k|      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
 1848|  23.7k|      const __m128i scaled_bottom_left_y =
 1849|  23.7k|          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
 1850|  23.7k|      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1851|  23.7k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1852|  23.7k|                                     round);
 1853|  23.7k|      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1854|  23.7k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 1855|  23.7k|                                     round);
 1856|  23.7k|      dst += stride;
 1857|  23.7k|    }
 1858|  2.96k|  }
 1859|    741|}
aom_smooth_v_predictor_64x16_ssse3:
 1865|  13.5k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1866|  13.5k|  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
 1867|  13.5k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  13.5k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1868|  13.5k|  const __m128i zero = _mm_setzero_si128();
 1869|  13.5k|  const __m128i top_lolo = LoadUnaligned16(top_row);
 1870|  13.5k|  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
 1871|  13.5k|  const __m128i top1 = cvtepu8_epi16(top_lolo);
 1872|  13.5k|  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
 1873|  13.5k|  const __m128i top3 = cvtepu8_epi16(top_lohi);
 1874|  13.5k|  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
 1875|       |
 1876|  13.5k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 1877|  13.5k|  const __m128i weights1 = cvtepu8_epi16(weights);
 1878|  13.5k|  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
 1879|  13.5k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1880|  13.5k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1881|  13.5k|  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
 1882|  13.5k|  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
 1883|  13.5k|  const __m128i top5 = cvtepu8_epi16(top_hilo);
 1884|  13.5k|  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
 1885|  13.5k|  const __m128i top7 = cvtepu8_epi16(top_hihi);
 1886|  13.5k|  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
 1887|  13.5k|  const __m128i scaled_bottom_left1 =
 1888|  13.5k|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1889|  13.5k|  const __m128i scaled_bottom_left2 =
 1890|  13.5k|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1891|  13.5k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  13.5k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1892|   121k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1892:33): [True: 108k, False: 13.5k]
  ------------------
 1893|   108k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1894|   108k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1895|   108k|    const __m128i scaled_bottom_left_y =
 1896|   108k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1897|   108k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1898|   108k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1899|   108k|                                   round);
 1900|   108k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1901|   108k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1902|   108k|                                   round);
 1903|   108k|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 1904|   108k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1905|   108k|                                   round);
 1906|   108k|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 1907|   108k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1908|   108k|                                   round);
 1909|   108k|    dst += stride;
 1910|   108k|  }
 1911|   121k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1911:33): [True: 108k, False: 13.5k]
  ------------------
 1912|   108k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1913|   108k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1914|   108k|    const __m128i scaled_bottom_left_y =
 1915|   108k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1916|   108k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1917|   108k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1918|   108k|                                   round);
 1919|   108k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1920|   108k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1921|   108k|                                   round);
 1922|   108k|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 1923|   108k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1924|   108k|                                   round);
 1925|   108k|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 1926|   108k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1927|   108k|                                   round);
 1928|   108k|    dst += stride;
 1929|   108k|  }
 1930|  13.5k|}
aom_smooth_v_predictor_64x32_ssse3:
 1936|  2.41k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 1937|  2.41k|  const __m128i zero = _mm_setzero_si128();
 1938|  2.41k|  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
 1939|  2.41k|  const __m128i top_lolo = LoadUnaligned16(top_row);
 1940|  2.41k|  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
 1941|  2.41k|  const __m128i top1 = cvtepu8_epi16(top_lolo);
 1942|  2.41k|  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
 1943|  2.41k|  const __m128i top3 = cvtepu8_epi16(top_lohi);
 1944|  2.41k|  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
 1945|  2.41k|  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
 1946|  2.41k|  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
 1947|  2.41k|  const __m128i top5 = cvtepu8_epi16(top_hilo);
 1948|  2.41k|  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
 1949|  2.41k|  const __m128i top7 = cvtepu8_epi16(top_hihi);
 1950|  2.41k|  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
 1951|  2.41k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 1952|  2.41k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 1953|  2.41k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 1954|  2.41k|  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
 1955|  2.41k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 1956|  2.41k|  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
 1957|  2.41k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  2.41k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1958|  2.41k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 1959|  2.41k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 1960|  2.41k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 1961|  2.41k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 1962|  2.41k|  const __m128i scaled_bottom_left1 =
 1963|  2.41k|      _mm_mullo_epi16(inverted_weights1, bottom_left);
 1964|  2.41k|  const __m128i scaled_bottom_left2 =
 1965|  2.41k|      _mm_mullo_epi16(inverted_weights2, bottom_left);
 1966|  2.41k|  const __m128i scaled_bottom_left3 =
 1967|  2.41k|      _mm_mullo_epi16(inverted_weights3, bottom_left);
 1968|  2.41k|  const __m128i scaled_bottom_left4 =
 1969|  2.41k|      _mm_mullo_epi16(inverted_weights4, bottom_left);
 1970|  2.41k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  2.41k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1971|       |
 1972|  21.7k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1972:33): [True: 19.3k, False: 2.41k]
  ------------------
 1973|  19.3k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1974|  19.3k|    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
 1975|  19.3k|    const __m128i scaled_bottom_left_y =
 1976|  19.3k|        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
 1977|  19.3k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1978|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1979|  19.3k|                                   round);
 1980|  19.3k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 1981|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1982|  19.3k|                                   round);
 1983|  19.3k|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 1984|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1985|  19.3k|                                   round);
 1986|  19.3k|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 1987|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1988|  19.3k|                                   round);
 1989|  19.3k|    dst += stride;
 1990|  19.3k|  }
 1991|  21.7k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (1991:33): [True: 19.3k, False: 2.41k]
  ------------------
 1992|  19.3k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 1993|  19.3k|    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
 1994|  19.3k|    const __m128i scaled_bottom_left_y =
 1995|  19.3k|        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
 1996|  19.3k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 1997|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 1998|  19.3k|                                   round);
 1999|  19.3k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2000|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2001|  19.3k|                                   round);
 2002|  19.3k|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2003|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2004|  19.3k|                                   round);
 2005|  19.3k|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2006|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2007|  19.3k|                                   round);
 2008|  19.3k|    dst += stride;
 2009|  19.3k|  }
 2010|  21.7k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2010:33): [True: 19.3k, False: 2.41k]
  ------------------
 2011|  19.3k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2012|  19.3k|    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
 2013|  19.3k|    const __m128i scaled_bottom_left_y =
 2014|  19.3k|        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
 2015|  19.3k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 2016|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2017|  19.3k|                                   round);
 2018|  19.3k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2019|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2020|  19.3k|                                   round);
 2021|  19.3k|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2022|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2023|  19.3k|                                   round);
 2024|  19.3k|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2025|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2026|  19.3k|                                   round);
 2027|  19.3k|    dst += stride;
 2028|  19.3k|  }
 2029|  21.7k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2029:33): [True: 19.3k, False: 2.41k]
  ------------------
 2030|  19.3k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2031|  19.3k|    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
 2032|  19.3k|    const __m128i scaled_bottom_left_y =
 2033|  19.3k|        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
 2034|  19.3k|    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 2035|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2036|  19.3k|                                   round);
 2037|  19.3k|    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2038|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2039|  19.3k|                                   round);
 2040|  19.3k|    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2041|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2042|  19.3k|                                   round);
 2043|  19.3k|    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2044|  19.3k|                                   scaled_bottom_left_y, scaled_bottom_left_y,
 2045|  19.3k|                                   round);
 2046|  19.3k|    dst += stride;
 2047|  19.3k|  }
 2048|  2.41k|}
aom_smooth_v_predictor_64x64_ssse3:
 2053|  5.79k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2054|  5.79k|  const __m128i zero = _mm_setzero_si128();
 2055|  5.79k|  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
 2056|  5.79k|  const __m128i top_lolo = LoadUnaligned16(top_row);
 2057|  5.79k|  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
 2058|  5.79k|  const __m128i top1 = cvtepu8_epi16(top_lolo);
 2059|  5.79k|  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
 2060|  5.79k|  const __m128i top3 = cvtepu8_epi16(top_lohi);
 2061|  5.79k|  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
 2062|  5.79k|  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
 2063|  5.79k|  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
 2064|  5.79k|  const __m128i top5 = cvtepu8_epi16(top_hilo);
 2065|  5.79k|  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
 2066|  5.79k|  const __m128i top7 = cvtepu8_epi16(top_hihi);
 2067|  5.79k|  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
 2068|  5.79k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  5.79k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2069|  5.79k|  const __m128i round = _mm_set1_epi16(128);
 2070|  5.79k|  const uint8_t *weights_base_ptr = smooth_weights + 60;
 2071|  28.9k|  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
  ------------------
  |  Branch (2071:29): [True: 23.1k, False: 5.79k]
  ------------------
 2072|  23.1k|    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
 2073|  23.1k|    const __m128i weights_lo = cvtepu8_epi16(weights);
 2074|  23.1k|    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
 2075|  23.1k|    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
 2076|  23.1k|    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
 2077|  23.1k|    const __m128i scaled_bottom_left_lo =
 2078|  23.1k|        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
 2079|  23.1k|    const __m128i scaled_bottom_left_hi =
 2080|  23.1k|        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
 2081|   208k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2081:35): [True: 185k, False: 23.1k]
  ------------------
 2082|   185k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 2083|   185k|      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
 2084|   185k|      const __m128i scaled_bottom_left_y =
 2085|   185k|          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
 2086|   185k|      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 2087|   185k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2088|   185k|                                     round);
 2089|   185k|      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2090|   185k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2091|   185k|                                     round);
 2092|   185k|      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2093|   185k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2094|   185k|                                     round);
 2095|   185k|      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2096|   185k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2097|   185k|                                     round);
 2098|   185k|      dst += stride;
 2099|   185k|    }
 2100|   208k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2100:35): [True: 185k, False: 23.1k]
  ------------------
 2101|   185k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 2102|   185k|      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
 2103|   185k|      const __m128i scaled_bottom_left_y =
 2104|   185k|          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
 2105|   185k|      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
 2106|   185k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2107|   185k|                                     round);
 2108|   185k|      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
 2109|   185k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2110|   185k|                                     round);
 2111|   185k|      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
 2112|   185k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2113|   185k|                                     round);
 2114|   185k|      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
 2115|   185k|                                     scaled_bottom_left_y, scaled_bottom_left_y,
 2116|   185k|                                     round);
 2117|   185k|      dst += stride;
 2118|   185k|    }
 2119|  23.1k|  }
 2120|  5.79k|}
aom_smooth_h_predictor_4x4_ssse3:
 2138|  74.8k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2139|  74.8k|  const __m128i top_right = _mm_set1_epi32(top_row[3]);
 2140|  74.8k|  const __m128i left = cvtepu8_epi32(Load4(left_column));
 2141|  74.8k|  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
 2142|  74.8k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  74.8k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2143|  74.8k|  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
 2144|  74.8k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2145|  74.8k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  74.8k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2146|  74.8k|  __m128i left_y = _mm_shuffle_epi32(left, 0);
 2147|  74.8k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2148|  74.8k|                               &round);
 2149|  74.8k|  dst += stride;
 2150|  74.8k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2151|  74.8k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2152|  74.8k|                               &round);
 2153|  74.8k|  dst += stride;
 2154|  74.8k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2155|  74.8k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2156|  74.8k|                               &round);
 2157|  74.8k|  dst += stride;
 2158|  74.8k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2159|  74.8k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2160|  74.8k|                               &round);
 2161|  74.8k|}
aom_smooth_h_predictor_4x8_ssse3:
 2166|  21.1k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2167|  21.1k|  const __m128i top_right = _mm_set1_epi32(top_row[3]);
 2168|  21.1k|  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
 2169|  21.1k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  21.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2170|  21.1k|  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
 2171|  21.1k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2172|  21.1k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  21.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2173|  21.1k|  __m128i left = cvtepu8_epi32(Load4(left_column));
 2174|  21.1k|  __m128i left_y = _mm_shuffle_epi32(left, 0);
 2175|  21.1k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2176|  21.1k|                               &round);
 2177|  21.1k|  dst += stride;
 2178|  21.1k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2179|  21.1k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2180|  21.1k|                               &round);
 2181|  21.1k|  dst += stride;
 2182|  21.1k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2183|  21.1k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2184|  21.1k|                               &round);
 2185|  21.1k|  dst += stride;
 2186|  21.1k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2187|  21.1k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2188|  21.1k|                               &round);
 2189|  21.1k|  dst += stride;
 2190|       |
 2191|  21.1k|  left = cvtepu8_epi32(Load4(left_column + 4));
 2192|  21.1k|  left_y = _mm_shuffle_epi32(left, 0);
 2193|  21.1k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2194|  21.1k|                               &round);
 2195|  21.1k|  dst += stride;
 2196|  21.1k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2197|  21.1k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2198|  21.1k|                               &round);
 2199|  21.1k|  dst += stride;
 2200|  21.1k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2201|  21.1k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2202|  21.1k|                               &round);
 2203|  21.1k|  dst += stride;
 2204|  21.1k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2205|  21.1k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2206|  21.1k|                               &round);
 2207|  21.1k|}
aom_smooth_h_predictor_4x16_ssse3:
 2213|  15.9k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2214|  15.9k|  const __m128i top_right = _mm_set1_epi32(top_row[3]);
 2215|  15.9k|  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
 2216|  15.9k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  15.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2217|  15.9k|  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
 2218|  15.9k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2219|  15.9k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  15.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2220|  15.9k|  __m128i left = cvtepu8_epi32(Load4(left_column));
 2221|  15.9k|  __m128i left_y = _mm_shuffle_epi32(left, 0);
 2222|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2223|  15.9k|                               &round);
 2224|  15.9k|  dst += stride;
 2225|  15.9k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2226|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2227|  15.9k|                               &round);
 2228|  15.9k|  dst += stride;
 2229|  15.9k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2230|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2231|  15.9k|                               &round);
 2232|  15.9k|  dst += stride;
 2233|  15.9k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2234|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2235|  15.9k|                               &round);
 2236|  15.9k|  dst += stride;
 2237|       |
 2238|  15.9k|  left = cvtepu8_epi32(Load4(left_column + 4));
 2239|  15.9k|  left_y = _mm_shuffle_epi32(left, 0);
 2240|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2241|  15.9k|                               &round);
 2242|  15.9k|  dst += stride;
 2243|  15.9k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2244|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2245|  15.9k|                               &round);
 2246|  15.9k|  dst += stride;
 2247|  15.9k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2248|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2249|  15.9k|                               &round);
 2250|  15.9k|  dst += stride;
 2251|  15.9k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2252|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2253|  15.9k|                               &round);
 2254|  15.9k|  dst += stride;
 2255|       |
 2256|  15.9k|  left = cvtepu8_epi32(Load4(left_column + 8));
 2257|  15.9k|  left_y = _mm_shuffle_epi32(left, 0);
 2258|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2259|  15.9k|                               &round);
 2260|  15.9k|  dst += stride;
 2261|  15.9k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2262|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2263|  15.9k|                               &round);
 2264|  15.9k|  dst += stride;
 2265|  15.9k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2266|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2267|  15.9k|                               &round);
 2268|  15.9k|  dst += stride;
 2269|  15.9k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2270|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2271|  15.9k|                               &round);
 2272|  15.9k|  dst += stride;
 2273|       |
 2274|  15.9k|  left = cvtepu8_epi32(Load4(left_column + 12));
 2275|  15.9k|  left_y = _mm_shuffle_epi32(left, 0);
 2276|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2277|  15.9k|                               &round);
 2278|  15.9k|  dst += stride;
 2279|  15.9k|  left_y = _mm_shuffle_epi32(left, 0x55);
 2280|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2281|  15.9k|                               &round);
 2282|  15.9k|  dst += stride;
 2283|  15.9k|  left_y = _mm_shuffle_epi32(left, 0xaa);
 2284|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2285|  15.9k|                               &round);
 2286|  15.9k|  dst += stride;
 2287|  15.9k|  left_y = _mm_shuffle_epi32(left, 0xff);
 2288|  15.9k|  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
 2289|  15.9k|                               &round);
 2290|  15.9k|}
aom_smooth_h_predictor_8x4_ssse3:
 2299|  34.9k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2300|  34.9k|  const __m128i top_right = _mm_set1_epi16(top_row[7]);
 2301|  34.9k|  const __m128i left = cvtepu8_epi16(Load4(left_column));
 2302|  34.9k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 2303|  34.9k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  34.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2304|  34.9k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 2305|  34.9k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2306|  34.9k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  34.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2307|  34.9k|  __m128i y_select = _mm_set1_epi32(0x01000100);
 2308|  34.9k|  __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2309|  34.9k|  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2310|  34.9k|                                &round);
 2311|  34.9k|  dst += stride;
 2312|  34.9k|  y_select = _mm_set1_epi32(0x03020302);
 2313|  34.9k|  left_y = _mm_shuffle_epi8(left, y_select);
 2314|  34.9k|  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2315|  34.9k|                                &round);
 2316|  34.9k|  dst += stride;
 2317|  34.9k|  y_select = _mm_set1_epi32(0x05040504);
 2318|  34.9k|  left_y = _mm_shuffle_epi8(left, y_select);
 2319|  34.9k|  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2320|  34.9k|                                &round);
 2321|  34.9k|  dst += stride;
 2322|  34.9k|  y_select = _mm_set1_epi32(0x07060706);
 2323|  34.9k|  left_y = _mm_shuffle_epi8(left, y_select);
 2324|  34.9k|  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2325|  34.9k|                                &round);
 2326|  34.9k|}
aom_smooth_h_predictor_8x8_ssse3:
 2331|  49.1k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2332|  49.1k|  const __m128i top_right = _mm_set1_epi16(top_row[7]);
 2333|  49.1k|  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2334|  49.1k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 2335|  49.1k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  49.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2336|  49.1k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 2337|  49.1k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2338|  49.1k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  49.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2339|   442k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2339:33): [True: 393k, False: 49.1k]
  ------------------
 2340|   393k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2341|   393k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2342|   393k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2343|   393k|                                  &round);
 2344|   393k|    dst += stride;
 2345|   393k|  }
 2346|  49.1k|}
aom_smooth_h_predictor_8x16_ssse3:
 2351|  17.8k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2352|  17.8k|  const __m128i top_right = _mm_set1_epi16(top_row[7]);
 2353|  17.8k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 2354|  17.8k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  17.8k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2355|  17.8k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 2356|  17.8k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2357|  17.8k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  17.8k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2358|  17.8k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2359|   160k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2359:33): [True: 142k, False: 17.8k]
  ------------------
 2360|   142k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2361|   142k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2362|   142k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2363|   142k|                                  &round);
 2364|   142k|    dst += stride;
 2365|   142k|  }
 2366|  17.8k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2367|   160k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2367:33): [True: 142k, False: 17.8k]
  ------------------
 2368|   142k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2369|   142k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2370|   142k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2371|   142k|                                  &round);
 2372|   142k|    dst += stride;
 2373|   142k|  }
 2374|  17.8k|}
aom_smooth_h_predictor_8x32_ssse3:
 2380|  6.02k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2381|  6.02k|  const __m128i top_right = _mm_set1_epi16(top_row[7]);
 2382|  6.02k|  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
 2383|  6.02k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  6.02k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2384|  6.02k|  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
 2385|  6.02k|  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
 2386|  6.02k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  6.02k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2387|  6.02k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2388|  54.1k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2388:33): [True: 48.1k, False: 6.02k]
  ------------------
 2389|  48.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2390|  48.1k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2391|  48.1k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2392|  48.1k|                                  &round);
 2393|  48.1k|    dst += stride;
 2394|  48.1k|  }
 2395|  6.02k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2396|  54.1k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2396:33): [True: 48.1k, False: 6.02k]
  ------------------
 2397|  48.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2398|  48.1k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2399|  48.1k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2400|  48.1k|                                  &round);
 2401|  48.1k|    dst += stride;
 2402|  48.1k|  }
 2403|  6.02k|  left = cvtepu8_epi16(LoadLo8(left_column + 16));
 2404|  54.1k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2404:33): [True: 48.1k, False: 6.02k]
  ------------------
 2405|  48.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2406|  48.1k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2407|  48.1k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2408|  48.1k|                                  &round);
 2409|  48.1k|    dst += stride;
 2410|  48.1k|  }
 2411|  6.02k|  left = cvtepu8_epi16(LoadLo8(left_column + 24));
 2412|  54.1k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2412:33): [True: 48.1k, False: 6.02k]
  ------------------
 2413|  48.1k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2414|  48.1k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2415|  48.1k|    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
 2416|  48.1k|                                  &round);
 2417|  48.1k|    dst += stride;
 2418|  48.1k|  }
 2419|  6.02k|}
aom_smooth_h_predictor_16x4_ssse3:
 2424|  23.9k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2425|  23.9k|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2426|  23.9k|  const __m128i left = cvtepu8_epi16(Load4(left_column));
 2427|  23.9k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2428|  23.9k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  23.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2429|  23.9k|  const __m128i weights1 = cvtepu8_epi16(weights);
 2430|  23.9k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2431|  23.9k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2432|  23.9k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2433|  23.9k|  const __m128i scaled_top_right1 =
 2434|  23.9k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2435|  23.9k|  const __m128i scaled_top_right2 =
 2436|  23.9k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2437|  23.9k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  23.9k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2438|  23.9k|  __m128i y_mask = _mm_set1_epi32(0x01000100);
 2439|  23.9k|  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
 2440|  23.9k|  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2441|  23.9k|                                 scaled_top_right1, scaled_top_right2, round);
 2442|  23.9k|  dst += stride;
 2443|  23.9k|  y_mask = _mm_set1_epi32(0x03020302);
 2444|  23.9k|  left_y = _mm_shuffle_epi8(left, y_mask);
 2445|  23.9k|  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2446|  23.9k|                                 scaled_top_right1, scaled_top_right2, round);
 2447|  23.9k|  dst += stride;
 2448|  23.9k|  y_mask = _mm_set1_epi32(0x05040504);
 2449|  23.9k|  left_y = _mm_shuffle_epi8(left, y_mask);
 2450|  23.9k|  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2451|  23.9k|                                 scaled_top_right1, scaled_top_right2, round);
 2452|  23.9k|  dst += stride;
 2453|  23.9k|  y_mask = _mm_set1_epi32(0x07060706);
 2454|  23.9k|  left_y = _mm_shuffle_epi8(left, y_mask);
 2455|  23.9k|  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2456|  23.9k|                                 scaled_top_right1, scaled_top_right2, round);
 2457|  23.9k|}
aom_smooth_h_predictor_16x8_ssse3:
 2463|  22.2k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2464|  22.2k|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2465|  22.2k|  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2466|  22.2k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2467|  22.2k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  22.2k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2468|  22.2k|  const __m128i weights1 = cvtepu8_epi16(weights);
 2469|  22.2k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2470|  22.2k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2471|  22.2k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2472|  22.2k|  const __m128i scaled_top_right1 =
 2473|  22.2k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2474|  22.2k|  const __m128i scaled_top_right2 =
 2475|  22.2k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2476|  22.2k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  22.2k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2477|   200k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2477:33): [True: 178k, False: 22.2k]
  ------------------
 2478|   178k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2479|   178k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2480|   178k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2481|   178k|                                   scaled_top_right1, scaled_top_right2, round);
 2482|   178k|    dst += stride;
 2483|   178k|  }
 2484|  22.2k|}
aom_smooth_h_predictor_16x16_ssse3:
 2489|  42.1k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2490|  42.1k|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2491|  42.1k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2492|  42.1k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  42.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2493|  42.1k|  const __m128i weights1 = cvtepu8_epi16(weights);
 2494|  42.1k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2495|  42.1k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2496|  42.1k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2497|  42.1k|  const __m128i scaled_top_right1 =
 2498|  42.1k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2499|  42.1k|  const __m128i scaled_top_right2 =
 2500|  42.1k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2501|  42.1k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  42.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2502|  42.1k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2503|   379k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2503:33): [True: 337k, False: 42.1k]
  ------------------
 2504|   337k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2505|   337k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2506|   337k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2507|   337k|                                   scaled_top_right1, scaled_top_right2, round);
 2508|   337k|    dst += stride;
 2509|   337k|  }
 2510|  42.1k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2511|   379k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2511:33): [True: 337k, False: 42.1k]
  ------------------
 2512|   337k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2513|   337k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2514|   337k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2515|   337k|                                   scaled_top_right1, scaled_top_right2, round);
 2516|   337k|    dst += stride;
 2517|   337k|  }
 2518|  42.1k|}
aom_smooth_h_predictor_16x32_ssse3:
 2523|  12.3k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2524|  12.3k|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2525|  12.3k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2526|  12.3k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  12.3k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2527|  12.3k|  const __m128i weights1 = cvtepu8_epi16(weights);
 2528|  12.3k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2529|  12.3k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2530|  12.3k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2531|  12.3k|  const __m128i scaled_top_right1 =
 2532|  12.3k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2533|  12.3k|  const __m128i scaled_top_right2 =
 2534|  12.3k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2535|  12.3k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  12.3k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2536|  12.3k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2537|   111k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2537:33): [True: 98.8k, False: 12.3k]
  ------------------
 2538|  98.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2539|  98.8k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2540|  98.8k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2541|  98.8k|                                   scaled_top_right1, scaled_top_right2, round);
 2542|  98.8k|    dst += stride;
 2543|  98.8k|  }
 2544|  12.3k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2545|   111k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2545:33): [True: 98.8k, False: 12.3k]
  ------------------
 2546|  98.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2547|  98.8k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2548|  98.8k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2549|  98.8k|                                   scaled_top_right1, scaled_top_right2, round);
 2550|  98.8k|    dst += stride;
 2551|  98.8k|  }
 2552|  12.3k|  left = cvtepu8_epi16(LoadLo8(left_column + 16));
 2553|   111k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2553:33): [True: 98.8k, False: 12.3k]
  ------------------
 2554|  98.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2555|  98.8k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2556|  98.8k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2557|  98.8k|                                   scaled_top_right1, scaled_top_right2, round);
 2558|  98.8k|    dst += stride;
 2559|  98.8k|  }
 2560|  12.3k|  left = cvtepu8_epi16(LoadLo8(left_column + 24));
 2561|   111k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2561:33): [True: 98.8k, False: 12.3k]
  ------------------
 2562|  98.8k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2563|  98.8k|    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2564|  98.8k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2565|  98.8k|                                   scaled_top_right1, scaled_top_right2, round);
 2566|  98.8k|    dst += stride;
 2567|  98.8k|  }
 2568|  12.3k|}
aom_smooth_h_predictor_16x64_ssse3:
 2574|  3.03k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2575|  3.03k|  const __m128i top_right = _mm_set1_epi16(top_row[15]);
 2576|  3.03k|  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
 2577|  3.03k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  3.03k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2578|  3.03k|  const __m128i weights1 = cvtepu8_epi16(weights);
 2579|  3.03k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
 2580|  3.03k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2581|  3.03k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2582|  3.03k|  const __m128i scaled_top_right1 =
 2583|  3.03k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2584|  3.03k|  const __m128i scaled_top_right2 =
 2585|  3.03k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2586|  3.03k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  3.03k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2587|  27.3k|  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
  ------------------
  |  Branch (2587:29): [True: 24.3k, False: 3.03k]
  ------------------
 2588|  24.3k|    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
 2589|   218k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2589:35): [True: 194k, False: 24.3k]
  ------------------
 2590|   194k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 2591|   194k|      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2592|   194k|      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2593|   194k|                                     scaled_top_right1, scaled_top_right2,
 2594|   194k|                                     round);
 2595|   194k|      dst += stride;
 2596|   194k|    }
 2597|  24.3k|  }
 2598|  3.03k|}
aom_smooth_h_predictor_32x8_ssse3:
 2603|  18.1k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2604|  18.1k|  const __m128i top_right = _mm_set1_epi16(top_row[31]);
 2605|  18.1k|  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2606|  18.1k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 2607|  18.1k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 2608|  18.1k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  18.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2609|  18.1k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 2610|  18.1k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
 2611|  18.1k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 2612|  18.1k|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
 2613|  18.1k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2614|  18.1k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2615|  18.1k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2616|  18.1k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2617|  18.1k|  const __m128i scaled_top_right1 =
 2618|  18.1k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2619|  18.1k|  const __m128i scaled_top_right2 =
 2620|  18.1k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2621|  18.1k|  const __m128i scaled_top_right3 =
 2622|  18.1k|      _mm_mullo_epi16(inverted_weights3, top_right);
 2623|  18.1k|  const __m128i scaled_top_right4 =
 2624|  18.1k|      _mm_mullo_epi16(inverted_weights4, top_right);
 2625|  18.1k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  18.1k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2626|   163k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2626:33): [True: 145k, False: 18.1k]
  ------------------
 2627|   145k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2628|   145k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2629|   145k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2630|   145k|                                   scaled_top_right1, scaled_top_right2, round);
 2631|   145k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2632|   145k|                                   scaled_top_right3, scaled_top_right4, round);
 2633|   145k|    dst += stride;
 2634|   145k|  }
 2635|  18.1k|}
aom_smooth_h_predictor_32x16_ssse3:
 2641|  10.5k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2642|  10.5k|  const __m128i top_right = _mm_set1_epi16(top_row[31]);
 2643|  10.5k|  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
 2644|  10.5k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 2645|  10.5k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 2646|  10.5k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  10.5k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2647|  10.5k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 2648|  10.5k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
 2649|  10.5k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 2650|  10.5k|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
 2651|  10.5k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2652|  10.5k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2653|  10.5k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2654|  10.5k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2655|  10.5k|  const __m128i scaled_top_right1 =
 2656|  10.5k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2657|  10.5k|  const __m128i scaled_top_right2 =
 2658|  10.5k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2659|  10.5k|  const __m128i scaled_top_right3 =
 2660|  10.5k|      _mm_mullo_epi16(inverted_weights3, top_right);
 2661|  10.5k|  const __m128i scaled_top_right4 =
 2662|  10.5k|      _mm_mullo_epi16(inverted_weights4, top_right);
 2663|  10.5k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  10.5k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2664|  95.3k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2664:33): [True: 84.7k, False: 10.5k]
  ------------------
 2665|  84.7k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2666|  84.7k|    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
 2667|  84.7k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2668|  84.7k|                                   scaled_top_right1, scaled_top_right2, round);
 2669|  84.7k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2670|  84.7k|                                   scaled_top_right3, scaled_top_right4, round);
 2671|  84.7k|    dst += stride;
 2672|  84.7k|  }
 2673|  10.5k|  const __m128i left2 =
 2674|  10.5k|      cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8));
 2675|  95.3k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2675:33): [True: 84.7k, False: 10.5k]
  ------------------
 2676|  84.7k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2677|  84.7k|    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
 2678|  84.7k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2679|  84.7k|                                   scaled_top_right1, scaled_top_right2, round);
 2680|  84.7k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2681|  84.7k|                                   scaled_top_right3, scaled_top_right4, round);
 2682|  84.7k|    dst += stride;
 2683|  84.7k|  }
 2684|  10.5k|}
aom_smooth_h_predictor_32x32_ssse3:
 2689|  41.4k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2690|  41.4k|  const __m128i top_right = _mm_set1_epi16(top_row[31]);
 2691|  41.4k|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 2692|  41.4k|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 2693|  41.4k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  41.4k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2694|  41.4k|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 2695|  41.4k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
 2696|  41.4k|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 2697|  41.4k|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
 2698|  41.4k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2699|  41.4k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2700|  41.4k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2701|  41.4k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2702|  41.4k|  const __m128i scaled_top_right1 =
 2703|  41.4k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2704|  41.4k|  const __m128i scaled_top_right2 =
 2705|  41.4k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2706|  41.4k|  const __m128i scaled_top_right3 =
 2707|  41.4k|      _mm_mullo_epi16(inverted_weights3, top_right);
 2708|  41.4k|  const __m128i scaled_top_right4 =
 2709|  41.4k|      _mm_mullo_epi16(inverted_weights4, top_right);
 2710|  41.4k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  41.4k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2711|  41.4k|  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
 2712|   373k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2712:33): [True: 331k, False: 41.4k]
  ------------------
 2713|   331k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2714|   331k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2715|   331k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2716|   331k|                                   scaled_top_right1, scaled_top_right2, round);
 2717|   331k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2718|   331k|                                   scaled_top_right3, scaled_top_right4, round);
 2719|   331k|    dst += stride;
 2720|   331k|  }
 2721|  41.4k|  left = cvtepu8_epi16(LoadLo8(left_column + 8));
 2722|   373k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2722:33): [True: 331k, False: 41.4k]
  ------------------
 2723|   331k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2724|   331k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2725|   331k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2726|   331k|                                   scaled_top_right1, scaled_top_right2, round);
 2727|   331k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2728|   331k|                                   scaled_top_right3, scaled_top_right4, round);
 2729|   331k|    dst += stride;
 2730|   331k|  }
 2731|  41.4k|  left = cvtepu8_epi16(LoadLo8(left_column + 16));
 2732|   373k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2732:33): [True: 331k, False: 41.4k]
  ------------------
 2733|   331k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2734|   331k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2735|   331k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2736|   331k|                                   scaled_top_right1, scaled_top_right2, round);
 2737|   331k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2738|   331k|                                   scaled_top_right3, scaled_top_right4, round);
 2739|   331k|    dst += stride;
 2740|   331k|  }
 2741|  41.4k|  left = cvtepu8_epi16(LoadLo8(left_column + 24));
 2742|   373k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2742:33): [True: 331k, False: 41.4k]
  ------------------
 2743|   331k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2744|   331k|    __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2745|   331k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2746|   331k|                                   scaled_top_right1, scaled_top_right2, round);
 2747|   331k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2748|   331k|                                   scaled_top_right3, scaled_top_right4, round);
 2749|   331k|    dst += stride;
 2750|   331k|  }
 2751|  41.4k|}
aom_smooth_h_predictor_32x64_ssse3:
 2756|    815|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2757|    815|  const __m128i top_right = _mm_set1_epi16(top_row[31]);
 2758|    815|  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
 2759|    815|  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
 2760|    815|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|    815|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2761|    815|  const __m128i weights1 = cvtepu8_epi16(weights_lo);
 2762|    815|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
 2763|    815|  const __m128i weights3 = cvtepu8_epi16(weights_hi);
 2764|    815|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
 2765|    815|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2766|    815|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2767|    815|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2768|    815|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2769|    815|  const __m128i scaled_top_right1 =
 2770|    815|      _mm_mullo_epi16(inverted_weights1, top_right);
 2771|    815|  const __m128i scaled_top_right2 =
 2772|    815|      _mm_mullo_epi16(inverted_weights2, top_right);
 2773|    815|  const __m128i scaled_top_right3 =
 2774|    815|      _mm_mullo_epi16(inverted_weights3, top_right);
 2775|    815|  const __m128i scaled_top_right4 =
 2776|    815|      _mm_mullo_epi16(inverted_weights4, top_right);
 2777|    815|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|    815|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2778|  7.33k|  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
  ------------------
  |  Branch (2778:29): [True: 6.52k, False: 815]
  ------------------
 2779|  6.52k|    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
 2780|  58.6k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2780:35): [True: 52.1k, False: 6.52k]
  ------------------
 2781|  52.1k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 2782|  52.1k|      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 2783|  52.1k|      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2784|  52.1k|                                     scaled_top_right1, scaled_top_right2,
 2785|  52.1k|                                     round);
 2786|  52.1k|      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
 2787|  52.1k|                                     weights4, scaled_top_right3,
 2788|  52.1k|                                     scaled_top_right4, round);
 2789|  52.1k|      dst += stride;
 2790|  52.1k|    }
 2791|  6.52k|  }
 2792|    815|}
aom_smooth_h_predictor_64x16_ssse3:
 2798|  5.70k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2799|  5.70k|  const __m128i top_right = _mm_set1_epi16(top_row[63]);
 2800|  5.70k|  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
 2801|  5.70k|  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
 2802|  5.70k|  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
 2803|  5.70k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  5.70k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2804|  5.70k|  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
 2805|  5.70k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
 2806|  5.70k|  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
 2807|  5.70k|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
 2808|  5.70k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2809|  5.70k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2810|  5.70k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2811|  5.70k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2812|  5.70k|  const __m128i scaled_top_right1 =
 2813|  5.70k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2814|  5.70k|  const __m128i scaled_top_right2 =
 2815|  5.70k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2816|  5.70k|  const __m128i scaled_top_right3 =
 2817|  5.70k|      _mm_mullo_epi16(inverted_weights3, top_right);
 2818|  5.70k|  const __m128i scaled_top_right4 =
 2819|  5.70k|      _mm_mullo_epi16(inverted_weights4, top_right);
 2820|  5.70k|  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
 2821|  5.70k|  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
 2822|  5.70k|  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
 2823|  5.70k|  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
 2824|  5.70k|  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
 2825|  5.70k|  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
 2826|  5.70k|  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
 2827|  5.70k|  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
 2828|  5.70k|  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
 2829|  5.70k|  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
 2830|  5.70k|  const __m128i scaled_top_right5 =
 2831|  5.70k|      _mm_mullo_epi16(inverted_weights5, top_right);
 2832|  5.70k|  const __m128i scaled_top_right6 =
 2833|  5.70k|      _mm_mullo_epi16(inverted_weights6, top_right);
 2834|  5.70k|  const __m128i scaled_top_right7 =
 2835|  5.70k|      _mm_mullo_epi16(inverted_weights7, top_right);
 2836|  5.70k|  const __m128i scaled_top_right8 =
 2837|  5.70k|      _mm_mullo_epi16(inverted_weights8, top_right);
 2838|  5.70k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  5.70k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2839|  51.3k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2839:33): [True: 45.6k, False: 5.70k]
  ------------------
 2840|  45.6k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2841|  45.6k|    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
 2842|  45.6k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2843|  45.6k|                                   scaled_top_right1, scaled_top_right2, round);
 2844|  45.6k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2845|  45.6k|                                   scaled_top_right3, scaled_top_right4, round);
 2846|  45.6k|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2847|  45.6k|                                   scaled_top_right5, scaled_top_right6, round);
 2848|  45.6k|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2849|  45.6k|                                   scaled_top_right7, scaled_top_right8, round);
 2850|  45.6k|    dst += stride;
 2851|  45.6k|  }
 2852|  5.70k|  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
 2853|  51.3k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2853:33): [True: 45.6k, False: 5.70k]
  ------------------
 2854|  45.6k|    __m128i y_select = _mm_set1_epi32(y_mask);
 2855|  45.6k|    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
 2856|  45.6k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2857|  45.6k|                                   scaled_top_right1, scaled_top_right2, round);
 2858|  45.6k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2859|  45.6k|                                   scaled_top_right3, scaled_top_right4, round);
 2860|  45.6k|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2861|  45.6k|                                   scaled_top_right5, scaled_top_right6, round);
 2862|  45.6k|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2863|  45.6k|                                   scaled_top_right7, scaled_top_right8, round);
 2864|  45.6k|    dst += stride;
 2865|  45.6k|  }
 2866|  5.70k|}
aom_smooth_h_predictor_64x32_ssse3:
 2872|  1.45k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2873|  1.45k|  const __m128i top_right = _mm_set1_epi16(top_row[63]);
 2874|  1.45k|  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
 2875|  1.45k|  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
 2876|  1.45k|  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
 2877|  1.45k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  1.45k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2878|  1.45k|  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
 2879|  1.45k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
 2880|  1.45k|  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
 2881|  1.45k|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
 2882|  1.45k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2883|  1.45k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2884|  1.45k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2885|  1.45k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2886|  1.45k|  const __m128i scaled_top_right1 =
 2887|  1.45k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2888|  1.45k|  const __m128i scaled_top_right2 =
 2889|  1.45k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2890|  1.45k|  const __m128i scaled_top_right3 =
 2891|  1.45k|      _mm_mullo_epi16(inverted_weights3, top_right);
 2892|  1.45k|  const __m128i scaled_top_right4 =
 2893|  1.45k|      _mm_mullo_epi16(inverted_weights4, top_right);
 2894|  1.45k|  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
 2895|  1.45k|  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
 2896|  1.45k|  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
 2897|  1.45k|  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
 2898|  1.45k|  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
 2899|  1.45k|  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
 2900|  1.45k|  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
 2901|  1.45k|  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
 2902|  1.45k|  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
 2903|  1.45k|  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
 2904|  1.45k|  const __m128i scaled_top_right5 =
 2905|  1.45k|      _mm_mullo_epi16(inverted_weights5, top_right);
 2906|  1.45k|  const __m128i scaled_top_right6 =
 2907|  1.45k|      _mm_mullo_epi16(inverted_weights6, top_right);
 2908|  1.45k|  const __m128i scaled_top_right7 =
 2909|  1.45k|      _mm_mullo_epi16(inverted_weights7, top_right);
 2910|  1.45k|  const __m128i scaled_top_right8 =
 2911|  1.45k|      _mm_mullo_epi16(inverted_weights8, top_right);
 2912|  1.45k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  1.45k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2913|  13.0k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2913:33): [True: 11.6k, False: 1.45k]
  ------------------
 2914|  11.6k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2915|  11.6k|    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
 2916|  11.6k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2917|  11.6k|                                   scaled_top_right1, scaled_top_right2, round);
 2918|  11.6k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2919|  11.6k|                                   scaled_top_right3, scaled_top_right4, round);
 2920|  11.6k|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2921|  11.6k|                                   scaled_top_right5, scaled_top_right6, round);
 2922|  11.6k|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2923|  11.6k|                                   scaled_top_right7, scaled_top_right8, round);
 2924|  11.6k|    dst += stride;
 2925|  11.6k|  }
 2926|  1.45k|  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
 2927|  13.0k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2927:33): [True: 11.6k, False: 1.45k]
  ------------------
 2928|  11.6k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2929|  11.6k|    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
 2930|  11.6k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2931|  11.6k|                                   scaled_top_right1, scaled_top_right2, round);
 2932|  11.6k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2933|  11.6k|                                   scaled_top_right3, scaled_top_right4, round);
 2934|  11.6k|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2935|  11.6k|                                   scaled_top_right5, scaled_top_right6, round);
 2936|  11.6k|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2937|  11.6k|                                   scaled_top_right7, scaled_top_right8, round);
 2938|  11.6k|    dst += stride;
 2939|  11.6k|  }
 2940|  1.45k|  const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
 2941|  13.0k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2941:33): [True: 11.6k, False: 1.45k]
  ------------------
 2942|  11.6k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2943|  11.6k|    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
 2944|  11.6k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2945|  11.6k|                                   scaled_top_right1, scaled_top_right2, round);
 2946|  11.6k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2947|  11.6k|                                   scaled_top_right3, scaled_top_right4, round);
 2948|  11.6k|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2949|  11.6k|                                   scaled_top_right5, scaled_top_right6, round);
 2950|  11.6k|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2951|  11.6k|                                   scaled_top_right7, scaled_top_right8, round);
 2952|  11.6k|    dst += stride;
 2953|  11.6k|  }
 2954|  1.45k|  const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
 2955|  13.0k|  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (2955:33): [True: 11.6k, False: 1.45k]
  ------------------
 2956|  11.6k|    const __m128i y_select = _mm_set1_epi32(y_mask);
 2957|  11.6k|    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
 2958|  11.6k|    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 2959|  11.6k|                                   scaled_top_right1, scaled_top_right2, round);
 2960|  11.6k|    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
 2961|  11.6k|                                   scaled_top_right3, scaled_top_right4, round);
 2962|  11.6k|    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
 2963|  11.6k|                                   scaled_top_right5, scaled_top_right6, round);
 2964|  11.6k|    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
 2965|  11.6k|                                   scaled_top_right7, scaled_top_right8, round);
 2966|  11.6k|    dst += stride;
 2967|  11.6k|  }
 2968|  1.45k|}
aom_smooth_h_predictor_64x64_ssse3:
 2973|  4.67k|    const uint8_t *LIBAOM_RESTRICT left_column) {
 2974|  4.67k|  const __m128i top_right = _mm_set1_epi16(top_row[63]);
 2975|  4.67k|  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
 2976|  4.67k|  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
 2977|  4.67k|  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  4.67k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 2978|  4.67k|  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
 2979|  4.67k|  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
 2980|  4.67k|  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
 2981|  4.67k|  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
 2982|  4.67k|  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
 2983|  4.67k|  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
 2984|  4.67k|  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
 2985|  4.67k|  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
 2986|  4.67k|  const __m128i scaled_top_right1 =
 2987|  4.67k|      _mm_mullo_epi16(inverted_weights1, top_right);
 2988|  4.67k|  const __m128i scaled_top_right2 =
 2989|  4.67k|      _mm_mullo_epi16(inverted_weights2, top_right);
 2990|  4.67k|  const __m128i scaled_top_right3 =
 2991|  4.67k|      _mm_mullo_epi16(inverted_weights3, top_right);
 2992|  4.67k|  const __m128i scaled_top_right4 =
 2993|  4.67k|      _mm_mullo_epi16(inverted_weights4, top_right);
 2994|  4.67k|  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
 2995|  4.67k|  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
 2996|  4.67k|  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
 2997|  4.67k|  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
 2998|  4.67k|  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
 2999|  4.67k|  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
 3000|  4.67k|  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
 3001|  4.67k|  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
 3002|  4.67k|  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
 3003|  4.67k|  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
 3004|  4.67k|  const __m128i scaled_top_right5 =
 3005|  4.67k|      _mm_mullo_epi16(inverted_weights5, top_right);
 3006|  4.67k|  const __m128i scaled_top_right6 =
 3007|  4.67k|      _mm_mullo_epi16(inverted_weights6, top_right);
 3008|  4.67k|  const __m128i scaled_top_right7 =
 3009|  4.67k|      _mm_mullo_epi16(inverted_weights7, top_right);
 3010|  4.67k|  const __m128i scaled_top_right8 =
 3011|  4.67k|      _mm_mullo_epi16(inverted_weights8, top_right);
 3012|  4.67k|  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  ------------------
  |  |   19|  4.67k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 3013|  42.0k|  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
  ------------------
  |  Branch (3013:29): [True: 37.3k, False: 4.67k]
  ------------------
 3014|  37.3k|    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
 3015|   336k|    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
  ------------------
  |  Branch (3015:35): [True: 299k, False: 37.3k]
  ------------------
 3016|   299k|      const __m128i y_select = _mm_set1_epi32(y_mask);
 3017|   299k|      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
 3018|   299k|      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
 3019|   299k|                                     scaled_top_right1, scaled_top_right2,
 3020|   299k|                                     round);
 3021|   299k|      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
 3022|   299k|                                     weights4, scaled_top_right3,
 3023|   299k|                                     scaled_top_right4, round);
 3024|   299k|      write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
 3025|   299k|                                     weights6, scaled_top_right5,
 3026|   299k|                                     scaled_top_right6, round);
 3027|   299k|      write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
 3028|   299k|                                     weights8, scaled_top_right7,
 3029|   299k|                                     scaled_top_right8, round);
 3030|   299k|      dst += stride;
 3031|   299k|    }
 3032|  37.3k|  }
 3033|  4.67k|}
intrapred_ssse3.c:paeth_8x1_pred:
   23|  7.53M|                                     const __m128i *topleft) {
   24|  7.53M|  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
   25|       |
   26|  7.53M|  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
   27|  7.53M|  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
   28|  7.53M|  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
   29|       |
   30|  7.53M|  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
   31|  7.53M|  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
   32|  7.53M|  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
   33|       |
   34|  7.53M|  pl = _mm_andnot_si128(mask1, *left);
   35|       |
   36|  7.53M|  ptl = _mm_and_si128(mask2, *topleft);
   37|  7.53M|  pt = _mm_andnot_si128(mask2, *top);
   38|  7.53M|  pt = _mm_or_si128(pt, ptl);
   39|  7.53M|  pt = _mm_and_si128(mask1, pt);
   40|       |
   41|  7.53M|  return _mm_or_si128(pl, pt);
   42|  7.53M|}
intrapred_ssse3.c:paeth_16x1_pred:
  198|   745k|                                      const __m128i *topleft) {
  199|   745k|  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  200|   745k|  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  201|   745k|  return _mm_packus_epi16(p0, p1);
  202|   745k|}
intrapred_ssse3.c:load_pixel_w4:
  598|   296k|                                 int height, __m128i *pixels) {
  599|   296k|  __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
  600|   296k|  if (height == 4)
  ------------------
  |  Branch (600:7): [True: 201k, False: 95.0k]
  ------------------
  601|   201k|    pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
  602|  95.0k|  else if (height == 8)
  ------------------
  |  Branch (602:12): [True: 62.6k, False: 32.4k]
  ------------------
  603|  62.6k|    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
  604|  32.4k|  else
  605|  32.4k|    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
  606|       |
  607|   296k|  pixels[2] = _mm_set1_epi16((int16_t)above[3]);
  608|       |
  609|   296k|  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  610|   296k|  const __m128i zero = _mm_setzero_si128();
  611|   296k|  d = _mm_unpacklo_epi8(d, zero);
  612|   296k|  pixels[0] = _mm_unpacklo_epi16(d, bp);
  613|   296k|}
intrapred_ssse3.c:load_weight_w4:
  621|   296k|                                  __m128i *weight_w) {
  622|   296k|  const __m128i zero = _mm_setzero_si128();
  623|   296k|  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|   296k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  624|   296k|  const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
  625|   296k|  weight_h[0] = _mm_unpacklo_epi8(t, zero);
  626|   296k|  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  627|   296k|  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
  628|       |
  629|   296k|  if (height == 8) {
  ------------------
  |  Branch (629:7): [True: 62.6k, False: 233k]
  ------------------
  630|  62.6k|    const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
  631|  62.6k|    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
  632|  62.6k|    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  633|   233k|  } else if (height == 16) {
  ------------------
  |  Branch (633:14): [True: 32.4k, False: 201k]
  ------------------
  634|  32.4k|    const __m128i weight =
  635|  32.4k|        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
  636|  32.4k|    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
  637|  32.4k|    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  638|  32.4k|    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
  639|  32.4k|    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  640|  32.4k|  }
  641|   296k|}
intrapred_ssse3.c:smooth_pred_4xh:
  645|   328k|                                   ptrdiff_t stride, int second_half) {
  646|   328k|  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|   328k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  647|   328k|  const __m128i one = _mm_set1_epi16(1);
  648|   328k|  const __m128i inc = _mm_set1_epi16(0x202);
  649|   328k|  const __m128i gat = _mm_set1_epi32(0xc080400);
  650|   328k|  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
  ------------------
  |  Branch (650:17): [True: 32.4k, False: 296k]
  ------------------
  651|   328k|                            : _mm_set1_epi16((short)0x8000);
  652|   328k|  __m128i d = _mm_set1_epi16(0x100);
  653|       |
  654|  2.15M|  for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (654:19): [True: 1.82M, False: 328k]
  ------------------
  655|  1.82M|    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
  656|  1.82M|    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
  657|  1.82M|    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
  658|  1.82M|    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
  659|       |
  660|  1.82M|    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
  661|  1.82M|    b = _mm_unpacklo_epi16(b, pixel[2]);
  662|  1.82M|    __m128i sum = _mm_madd_epi16(b, ww[0]);
  663|       |
  664|  1.82M|    sum = _mm_add_epi32(s, sum);
  665|  1.82M|    sum = _mm_add_epi32(sum, round);
  666|  1.82M|    sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  1.82M|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  667|       |
  668|  1.82M|    sum = _mm_shuffle_epi8(sum, gat);
  669|  1.82M|    *(int *)dst = _mm_cvtsi128_si32(sum);
  670|  1.82M|    dst += stride;
  671|       |
  672|  1.82M|    rep = _mm_add_epi16(rep, one);
  673|  1.82M|    d = _mm_add_epi16(d, inc);
  674|  1.82M|  }
  675|   328k|}
intrapred_ssse3.c:load_pixel_w8:
  724|   347k|                                 int height, __m128i *pixels) {
  725|   347k|  const __m128i zero = _mm_setzero_si128();
  726|   347k|  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  727|   347k|  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  728|   347k|  d = _mm_unpacklo_epi8(d, zero);
  729|   347k|  pixels[0] = _mm_unpacklo_epi16(d, bp);
  730|   347k|  pixels[1] = _mm_unpackhi_epi16(d, bp);
  731|       |
  732|   347k|  pixels[3] = _mm_set1_epi16((int16_t)above[7]);
  733|       |
  734|   347k|  if (height == 4) {
  ------------------
  |  Branch (734:7): [True: 91.3k, False: 255k]
  ------------------
  735|  91.3k|    pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
  736|   255k|  } else if (height == 8) {
  ------------------
  |  Branch (736:14): [True: 186k, False: 69.4k]
  ------------------
  737|   186k|    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
  738|   186k|  } else if (height == 16) {
  ------------------
  |  Branch (738:14): [True: 50.7k, False: 18.6k]
  ------------------
  739|  50.7k|    pixels[2] = _mm_load_si128((const __m128i *)left);
  740|  50.7k|  } else {
  741|  18.6k|    pixels[2] = _mm_load_si128((const __m128i *)left);
  742|  18.6k|    pixels[4] = pixels[0];
  743|  18.6k|    pixels[5] = pixels[1];
  744|  18.6k|    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
  745|  18.6k|    pixels[7] = pixels[3];
  746|  18.6k|  }
  747|   347k|}
intrapred_ssse3.c:load_weight_w8:
  760|   347k|                                  __m128i *weight_w) {
  761|   347k|  const __m128i zero = _mm_setzero_si128();
  762|   347k|  const int we_offset = height < 8 ? 0 : 4;
  ------------------
  |  Branch (762:25): [True: 91.3k, False: 255k]
  ------------------
  763|   347k|  __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
  764|   347k|  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  765|   347k|  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|   347k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  766|   347k|  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  767|       |
  768|   347k|  if (height == 4) {
  ------------------
  |  Branch (768:7): [True: 91.3k, False: 255k]
  ------------------
  769|  91.3k|    we = _mm_srli_si128(we, 4);
  770|  91.3k|    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
  771|  91.3k|    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
  772|  91.3k|    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
  773|  91.3k|    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  774|   255k|  } else {
  775|   255k|    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
  776|   255k|    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  777|   255k|  }
  778|       |
  779|   347k|  if (height == 16) {
  ------------------
  |  Branch (779:7): [True: 50.7k, False: 296k]
  ------------------
  780|  50.7k|    we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
  781|  50.7k|    weight_h[0] = _mm_unpacklo_epi8(we, zero);
  782|  50.7k|    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  783|  50.7k|    weight_h[2] = _mm_unpackhi_epi8(we, zero);
  784|  50.7k|    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  785|   296k|  } else if (height == 32) {
  ------------------
  |  Branch (785:14): [True: 18.6k, False: 277k]
  ------------------
  786|  18.6k|    const __m128i weight_lo =
  787|  18.6k|        _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
  788|  18.6k|    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
  789|  18.6k|    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  790|  18.6k|    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
  791|  18.6k|    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  792|  18.6k|    const __m128i weight_hi =
  793|  18.6k|        _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
  794|  18.6k|    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
  795|  18.6k|    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
  796|  18.6k|    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
  797|  18.6k|    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  798|  18.6k|  }
  799|   347k|}
intrapred_ssse3.c:smooth_pred_8xh:
  803|   453k|                                   ptrdiff_t stride, int second_half) {
  804|   453k|  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|   453k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  805|   453k|  const __m128i one = _mm_set1_epi16(1);
  806|   453k|  const __m128i inc = _mm_set1_epi16(0x202);
  807|   453k|  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  808|       |
  809|   453k|  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
  ------------------
  |  Branch (809:17): [True: 88.0k, False: 365k]
  ------------------
  810|   453k|                            : _mm_set1_epi16((short)0x8000);
  811|   453k|  __m128i d = _mm_set1_epi16(0x100);
  812|       |
  813|   453k|  int i;
  814|  3.71M|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (814:15): [True: 3.26M, False: 453k]
  ------------------
  815|  3.26M|    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
  816|  3.26M|    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
  817|  3.26M|    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
  818|  3.26M|    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
  819|  3.26M|    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
  820|       |
  821|  3.26M|    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
  822|  3.26M|    b = _mm_unpacklo_epi16(b, pixels[3]);
  823|  3.26M|    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
  824|  3.26M|    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
  825|       |
  826|  3.26M|    s0 = _mm_add_epi32(s0, sum0);
  827|  3.26M|    s0 = _mm_add_epi32(s0, round);
  828|  3.26M|    s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  3.26M|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  829|       |
  830|  3.26M|    s1 = _mm_add_epi32(s1, sum1);
  831|  3.26M|    s1 = _mm_add_epi32(s1, round);
  832|  3.26M|    s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|  3.26M|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  833|       |
  834|  3.26M|    sum0 = _mm_packus_epi16(s0, s1);
  835|  3.26M|    sum0 = _mm_shuffle_epi8(sum0, gat);
  836|  3.26M|    _mm_storel_epi64((__m128i *)dst, sum0);
  837|  3.26M|    dst += stride;
  838|       |
  839|  3.26M|    rep = _mm_add_epi16(rep, one);
  840|  3.26M|    d = _mm_add_epi16(d, inc);
  841|  3.26M|  }
  842|   453k|}
intrapred_ssse3.c:smooth_predictor_wxh:
  960|   505k|                                 int width, int height) {
  961|   505k|  const uint8_t *const sm_weights_h = smooth_weights + height - 4;
  962|   505k|  const uint8_t *const sm_weights_w = smooth_weights + width - 4;
  963|   505k|  const __m128i zero = _mm_setzero_si128();
  964|   505k|  const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|   505k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  965|   505k|  const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
  966|   505k|  const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
  967|   505k|  const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  ------------------
  |  |   19|   505k|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
  968|  10.8M|  for (int y = 0; y < height; ++y) {
  ------------------
  |  Branch (968:19): [True: 10.3M, False: 505k]
  ------------------
  969|  10.3M|    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
  970|  10.3M|    const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
  971|  10.3M|    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
  972|  10.3M|    __m128i scaled_bottom_left =
  973|  10.3M|        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
  974|  10.3M|    const __m128i weight_left_y =
  975|  10.3M|        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
  976|  10.3M|    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
  977|  10.3M|    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
  978|  54.1M|    for (int x = 0; x < width; x += 8) {
  ------------------
  |  Branch (978:21): [True: 43.7M, False: 10.3M]
  ------------------
  979|  43.7M|      const __m128i top_x = LoadLo8(top_row + x);
  980|  43.7M|      const __m128i weights_x = LoadLo8(sm_weights_w + x);
  981|  43.7M|      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
  982|  43.7M|      const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
  983|  43.7M|      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
  984|       |
  985|       |      // Here opposite weights and pixels are multiplied, where the order of
  986|       |      // interleaving is indicated in the names.
  987|  43.7M|      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
  988|  43.7M|      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
  989|       |
  990|       |      // |scaled_bottom_left| is always scaled by the same weight each row, so
  991|       |      // we only derive |scaled_top_right| values here.
  992|  43.7M|      const __m128i inverted_weights_x =
  993|  43.7M|          _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
  994|  43.7M|      const __m128i scaled_top_right =
  995|  43.7M|          _mm_mullo_epi16(inverted_weights_x, top_right);
  996|  43.7M|      const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
  997|  43.7M|      const __m128i scaled_top_right_hi =
  998|  43.7M|          _mm_unpackhi_epi16(scaled_top_right, zero);
  999|  43.7M|      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
 1000|  43.7M|      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
 1001|  43.7M|      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
 1002|  43.7M|      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
 1003|       |
 1004|       |      // The round value for RightShiftWithRounding was added with
 1005|       |      // |scaled_bottom_left|.
 1006|  43.7M|      pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|  43.7M|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1007|  43.7M|      pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
  ------------------
  |  |   19|  43.7M|#define SMOOTH_WEIGHT_LOG2_SCALE 8
  ------------------
 1008|  43.7M|      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
 1009|  43.7M|      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
 1010|  43.7M|    }
 1011|  10.3M|    dst += stride;
 1012|  10.3M|  }
 1013|   505k|}
intrapred_ssse3.c:cvtepu16_epi32:
  953|  43.7M|static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
  954|  43.7M|  return _mm_unpacklo_epi16((x), _mm_setzero_si128());
  955|  43.7M|}
intrapred_ssse3.c:StoreLo8:
  936|  45.5M|static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
  937|  45.5M|  _mm_storel_epi64((__m128i *)(a), v);
  938|  45.5M|}
intrapred_ssse3.c:load_smooth_vertical_pixels4:
 1135|  62.4k|    const int height, __m128i *pixels) {
 1136|  62.4k|  __m128i top = Load4(above);
 1137|  62.4k|  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
 1138|  62.4k|  top = cvtepu8_epi16(top);
 1139|  62.4k|  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
 1140|  62.4k|}
intrapred_ssse3.c:load_smooth_vertical_weights4:
 1148|  62.4k|    __m128i *weights) {
 1149|  62.4k|  const __m128i inverter = _mm_set1_epi16(256);
 1150|       |
 1151|  62.4k|  if (height == 4) {
  ------------------
  |  Branch (1151:7): [True: 39.3k, False: 23.0k]
  ------------------
 1152|  39.3k|    const __m128i weight = Load4(weight_array);
 1153|  39.3k|    weights[0] = cvtepu8_epi16(weight);
 1154|  39.3k|    weights[1] = _mm_sub_epi16(inverter, weights[0]);
 1155|  39.3k|  } else if (height == 8) {
  ------------------
  |  Branch (1155:14): [True: 13.4k, False: 9.59k]
  ------------------
 1156|  13.4k|    const __m128i weight = LoadLo8(weight_array + 4);
 1157|  13.4k|    weights[0] = cvtepu8_epi16(weight);
 1158|  13.4k|    weights[1] = _mm_sub_epi16(inverter, weights[0]);
 1159|  13.4k|  } else {
 1160|  9.59k|    const __m128i weight = LoadUnaligned16(weight_array + 12);
 1161|  9.59k|    const __m128i zero = _mm_setzero_si128();
 1162|  9.59k|    weights[0] = cvtepu8_epi16(weight);
 1163|  9.59k|    weights[1] = _mm_sub_epi16(inverter, weights[0]);
 1164|  9.59k|    weights[2] = _mm_unpackhi_epi8(weight, zero);
 1165|  9.59k|    weights[3] = _mm_sub_epi16(inverter, weights[2]);
 1166|  9.59k|  }
 1167|  62.4k|}
intrapred_ssse3.c:write_smooth_vertical4xh:
 1171|  72.0k|    uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
 1172|  72.0k|  const __m128i pred_round = _mm_set1_epi32(128);
 1173|  72.0k|  const __m128i mask_increment = _mm_set1_epi16(0x0202);
 1174|  72.0k|  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
 1175|  72.0k|  __m128i y_select = _mm_set1_epi16(0x0100);
 1176|       |
 1177|   490k|  for (int y = 0; y < height; ++y) {
  ------------------
  |  Branch (1177:19): [True: 418k, False: 72.0k]
  ------------------
 1178|   418k|    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
 1179|   418k|    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
 1180|   418k|    const __m128i alternate_weights =
 1181|   418k|        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
 1182|       |    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
 1183|       |    // The madd instruction yields four results of the form:
 1184|       |    // (top_row[x] * weight[y] + corner * inverted_weight[y])
 1185|   418k|    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
 1186|   418k|    sum = _mm_add_epi32(sum, pred_round);
 1187|   418k|    sum = _mm_srai_epi32(sum, 8);
 1188|   418k|    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
 1189|   418k|    Store4(dst, sum);
 1190|   418k|    dst += stride;
 1191|   418k|    y_select = _mm_add_epi16(y_select, mask_increment);
 1192|   418k|  }
 1193|  72.0k|}
intrapred_ssse3.c:Store4:
  931|  1.14M|static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
  932|  1.14M|  const int val = _mm_cvtsi128_si32(x);
  933|  1.14M|  memcpy(dst, &val, sizeof(val));
  934|  1.14M|}
intrapred_ssse3.c:cvtepu8_epi16:
  944|  89.7M|static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
  945|  89.7M|  return _mm_unpacklo_epi8((x), _mm_setzero_si128());
  946|  89.7M|}
intrapred_ssse3.c:Load4:
  911|   491k|static AOM_FORCE_INLINE __m128i Load4(const void *src) {
  912|       |  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  913|       |  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  914|       |  // movss instruction.
  915|       |  //
  916|       |  // Until compiler support of _mm_loadu_si32 is widespread, use of
  917|       |  // _mm_loadu_si32 is banned.
  918|   491k|  int val;
  919|   491k|  memcpy(&val, src, sizeof(val));
  920|   491k|  return _mm_cvtsi32_si128(val);
  921|   491k|}
intrapred_ssse3.c:LoadLo8:
  923|  88.3M|static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
  924|  88.3M|  return _mm_loadl_epi64((const __m128i *)(a));
  925|  88.3M|}
intrapred_ssse3.c:write_smooth_directional_sum8:
 1122|  1.79M|    const __m128i *scaled_corner, const __m128i *round) {
 1123|  1.79M|  const __m128i pred_sum =
 1124|  1.79M|      smooth_directional_sum8(*pixels, *weights, *scaled_corner);
 1125|       |  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
 1126|  1.79M|  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
 1127|  1.79M|  StoreLo8(dst, _mm_packus_epi16(pred, pred));
 1128|  1.79M|}
intrapred_ssse3.c:smooth_directional_sum8:
 1115|  1.79M|    const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
 1116|  1.79M|  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
 1117|  1.79M|  return _mm_add_epi16(scaled_corner, weighted_px);
 1118|  1.79M|}
intrapred_ssse3.c:LoadUnaligned16:
  927|   815k|static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
  928|   815k|  return _mm_loadu_si128((const __m128i *)(a));
  929|   815k|}
intrapred_ssse3.c:write_smooth_directional_sum16:
 1103|  13.6M|    const __m128i round) {
 1104|  13.6M|  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
 1105|  13.6M|  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
 1106|  13.6M|  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
 1107|  13.6M|  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
 1108|       |  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
 1109|  13.6M|  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
 1110|  13.6M|  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
 1111|  13.6M|  StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
 1112|  13.6M|}
intrapred_ssse3.c:StoreUnaligned16:
  940|  13.6M|static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
  941|  13.6M|  _mm_storeu_si128((__m128i *)(a), v);
  942|  13.6M|}
intrapred_ssse3.c:cvtepu8_epi32:
  948|   292k|static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
  949|   292k|  const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
  950|   292k|  return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
  951|   292k|}
intrapred_ssse3.c:write_smooth_horizontal_sum4:
 2126|   723k|    const __m128i *scaled_top_right, const __m128i *round) {
 2127|   723k|  const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
 2128|   723k|  const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
 2129|       |  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
 2130|   723k|  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
 2131|   723k|  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
 2132|   723k|  Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
 2133|   723k|}

intrapred_avx2.c:transpose16x16_sse2:
   94|   615k|static inline void transpose16x16_sse2(__m128i *x, __m128i *d) {
   95|   615k|  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
   96|   615k|  __m128i w10, w11, w12, w13, w14, w15;
   97|       |
   98|   615k|  w0 = _mm_unpacklo_epi8(x[0], x[1]);
   99|   615k|  w1 = _mm_unpacklo_epi8(x[2], x[3]);
  100|   615k|  w2 = _mm_unpacklo_epi8(x[4], x[5]);
  101|   615k|  w3 = _mm_unpacklo_epi8(x[6], x[7]);
  102|       |
  103|   615k|  w8 = _mm_unpacklo_epi8(x[8], x[9]);
  104|   615k|  w9 = _mm_unpacklo_epi8(x[10], x[11]);
  105|   615k|  w10 = _mm_unpacklo_epi8(x[12], x[13]);
  106|   615k|  w11 = _mm_unpacklo_epi8(x[14], x[15]);
  107|       |
  108|   615k|  w4 = _mm_unpacklo_epi16(w0, w1);
  109|   615k|  w5 = _mm_unpacklo_epi16(w2, w3);
  110|   615k|  w12 = _mm_unpacklo_epi16(w8, w9);
  111|   615k|  w13 = _mm_unpacklo_epi16(w10, w11);
  112|       |
  113|   615k|  w6 = _mm_unpacklo_epi32(w4, w5);
  114|   615k|  w7 = _mm_unpackhi_epi32(w4, w5);
  115|   615k|  w14 = _mm_unpacklo_epi32(w12, w13);
  116|   615k|  w15 = _mm_unpackhi_epi32(w12, w13);
  117|       |
  118|       |  // Store first 4-line result
  119|   615k|  d[0] = _mm_unpacklo_epi64(w6, w14);
  120|   615k|  d[1] = _mm_unpackhi_epi64(w6, w14);
  121|   615k|  d[2] = _mm_unpacklo_epi64(w7, w15);
  122|   615k|  d[3] = _mm_unpackhi_epi64(w7, w15);
  123|       |
  124|   615k|  w4 = _mm_unpackhi_epi16(w0, w1);
  125|   615k|  w5 = _mm_unpackhi_epi16(w2, w3);
  126|   615k|  w12 = _mm_unpackhi_epi16(w8, w9);
  127|   615k|  w13 = _mm_unpackhi_epi16(w10, w11);
  128|       |
  129|   615k|  w6 = _mm_unpacklo_epi32(w4, w5);
  130|   615k|  w7 = _mm_unpackhi_epi32(w4, w5);
  131|   615k|  w14 = _mm_unpacklo_epi32(w12, w13);
  132|   615k|  w15 = _mm_unpackhi_epi32(w12, w13);
  133|       |
  134|       |  // Store second 4-line result
  135|   615k|  d[4] = _mm_unpacklo_epi64(w6, w14);
  136|   615k|  d[5] = _mm_unpackhi_epi64(w6, w14);
  137|   615k|  d[6] = _mm_unpacklo_epi64(w7, w15);
  138|   615k|  d[7] = _mm_unpackhi_epi64(w7, w15);
  139|       |
  140|       |  // upper half
  141|   615k|  w0 = _mm_unpackhi_epi8(x[0], x[1]);
  142|   615k|  w1 = _mm_unpackhi_epi8(x[2], x[3]);
  143|   615k|  w2 = _mm_unpackhi_epi8(x[4], x[5]);
  144|   615k|  w3 = _mm_unpackhi_epi8(x[6], x[7]);
  145|       |
  146|   615k|  w8 = _mm_unpackhi_epi8(x[8], x[9]);
  147|   615k|  w9 = _mm_unpackhi_epi8(x[10], x[11]);
  148|   615k|  w10 = _mm_unpackhi_epi8(x[12], x[13]);
  149|   615k|  w11 = _mm_unpackhi_epi8(x[14], x[15]);
  150|       |
  151|   615k|  w4 = _mm_unpacklo_epi16(w0, w1);
  152|   615k|  w5 = _mm_unpacklo_epi16(w2, w3);
  153|   615k|  w12 = _mm_unpacklo_epi16(w8, w9);
  154|   615k|  w13 = _mm_unpacklo_epi16(w10, w11);
  155|       |
  156|   615k|  w6 = _mm_unpacklo_epi32(w4, w5);
  157|   615k|  w7 = _mm_unpackhi_epi32(w4, w5);
  158|   615k|  w14 = _mm_unpacklo_epi32(w12, w13);
  159|   615k|  w15 = _mm_unpackhi_epi32(w12, w13);
  160|       |
  161|       |  // Store first 4-line result
  162|   615k|  d[8] = _mm_unpacklo_epi64(w6, w14);
  163|   615k|  d[9] = _mm_unpackhi_epi64(w6, w14);
  164|   615k|  d[10] = _mm_unpacklo_epi64(w7, w15);
  165|   615k|  d[11] = _mm_unpackhi_epi64(w7, w15);
  166|       |
  167|   615k|  w4 = _mm_unpackhi_epi16(w0, w1);
  168|   615k|  w5 = _mm_unpackhi_epi16(w2, w3);
  169|   615k|  w12 = _mm_unpackhi_epi16(w8, w9);
  170|   615k|  w13 = _mm_unpackhi_epi16(w10, w11);
  171|       |
  172|   615k|  w6 = _mm_unpacklo_epi32(w4, w5);
  173|   615k|  w7 = _mm_unpackhi_epi32(w4, w5);
  174|   615k|  w14 = _mm_unpacklo_epi32(w12, w13);
  175|   615k|  w15 = _mm_unpackhi_epi32(w12, w13);
  176|       |
  177|       |  // Store second 4-line result
  178|   615k|  d[12] = _mm_unpacklo_epi64(w6, w14);
  179|   615k|  d[13] = _mm_unpackhi_epi64(w6, w14);
  180|   615k|  d[14] = _mm_unpacklo_epi64(w7, w15);
  181|   615k|  d[15] = _mm_unpackhi_epi64(w7, w15);
  182|   615k|}
intrapred_avx2.c:transpose:
  198|  31.6k|                      ptrdiff_t pitchDst, int width, int height) {
  199|   152k|  for (int j = 0; j < height; j += 16)
  ------------------
  |  Branch (199:19): [True: 120k, False: 31.6k]
  ------------------
  200|   541k|    for (int i = 0; i < width; i += 16)
  ------------------
  |  Branch (200:21): [True: 420k, False: 120k]
  ------------------
  201|   420k|      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
  202|   420k|                         dst + j * pitchDst + i, pitchDst);
  203|  31.6k|}
intrapred_avx2.c:transpose_TX_16X16:
  185|   420k|                               uint8_t *dst, ptrdiff_t pitchDst) {
  186|   420k|  __m128i r[16];
  187|   420k|  __m128i d[16];
  188|  7.14M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (188:19): [True: 6.72M, False: 420k]
  ------------------
  189|  6.72M|    r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
  190|  6.72M|  }
  191|   420k|  transpose16x16_sse2(r, d);
  192|  7.14M|  for (int j = 0; j < 16; j++) {
  ------------------
  |  Branch (192:19): [True: 6.72M, False: 420k]
  ------------------
  193|  6.72M|    _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
  194|  6.72M|  }
  195|   420k|}
intrapred_avx2.c:transpose4x16_sse2:
   56|  17.3k|static inline void transpose4x16_sse2(__m128i *x, __m128i *d) {
   57|  17.3k|  __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
   58|  17.3k|  w0 = _mm_unpacklo_epi8(x[0], x[1]);
   59|  17.3k|  w1 = _mm_unpacklo_epi8(x[2], x[3]);
   60|  17.3k|  w2 = _mm_unpackhi_epi8(x[0], x[1]);
   61|  17.3k|  w3 = _mm_unpackhi_epi8(x[2], x[3]);
   62|       |
   63|  17.3k|  ww0 = _mm_unpacklo_epi16(w0, w1);
   64|  17.3k|  ww1 = _mm_unpacklo_epi16(w2, w3);
   65|  17.3k|  ww2 = _mm_unpackhi_epi16(w0, w1);
   66|  17.3k|  ww3 = _mm_unpackhi_epi16(w2, w3);
   67|       |
   68|  17.3k|  w0 = _mm_unpacklo_epi32(ww0, ww1);
   69|  17.3k|  w2 = _mm_unpacklo_epi32(ww2, ww3);
   70|  17.3k|  w1 = _mm_unpackhi_epi32(ww0, ww1);
   71|  17.3k|  w3 = _mm_unpackhi_epi32(ww2, ww3);
   72|       |
   73|  17.3k|  d[0] = _mm_unpacklo_epi64(w0, w2);
   74|  17.3k|  d[1] = _mm_unpackhi_epi64(w0, w2);
   75|  17.3k|  d[2] = _mm_unpacklo_epi64(w1, w3);
   76|  17.3k|  d[3] = _mm_unpackhi_epi64(w1, w3);
   77|       |
   78|  17.3k|  d[4] = _mm_srli_si128(d[0], 8);
   79|  17.3k|  d[5] = _mm_srli_si128(d[1], 8);
   80|  17.3k|  d[6] = _mm_srli_si128(d[2], 8);
   81|  17.3k|  d[7] = _mm_srli_si128(d[3], 8);
   82|       |
   83|  17.3k|  d[8] = _mm_srli_si128(d[0], 4);
   84|  17.3k|  d[9] = _mm_srli_si128(d[1], 4);
   85|  17.3k|  d[10] = _mm_srli_si128(d[2], 4);
   86|  17.3k|  d[11] = _mm_srli_si128(d[3], 4);
   87|       |
   88|  17.3k|  d[12] = _mm_srli_si128(d[0], 12);
   89|  17.3k|  d[13] = _mm_srli_si128(d[1], 12);
   90|  17.3k|  d[14] = _mm_srli_si128(d[2], 12);
   91|  17.3k|  d[15] = _mm_srli_si128(d[3], 12);
   92|  17.3k|}

intrapred_sse2.c:dc_sum_16_sse2:
   19|  1.54M|static inline __m128i dc_sum_16_sse2(const uint8_t *ref) {
   20|  1.54M|  __m128i x = _mm_load_si128((__m128i const *)ref);
   21|  1.54M|  const __m128i zero = _mm_setzero_si128();
   22|  1.54M|  x = _mm_sad_epu8(x, zero);
   23|  1.54M|  const __m128i high = _mm_unpackhi_epi64(x, x);
   24|  1.54M|  return _mm_add_epi16(x, high);
   25|  1.54M|}
intrapred_sse2.c:dc_sum_32_sse2:
   27|   639k|static inline __m128i dc_sum_32_sse2(const uint8_t *ref) {
   28|   639k|  __m128i x0 = _mm_load_si128((__m128i const *)ref);
   29|   639k|  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
   30|   639k|  const __m128i zero = _mm_setzero_si128();
   31|   639k|  x0 = _mm_sad_epu8(x0, zero);
   32|   639k|  x1 = _mm_sad_epu8(x1, zero);
   33|   639k|  x0 = _mm_add_epi16(x0, x1);
   34|   639k|  const __m128i high = _mm_unpackhi_epi64(x0, x0);
   35|   639k|  return _mm_add_epi16(x0, high);
   36|   639k|}
intrapred_avx2.c:dc_sum_32_sse2:
   27|   147k|static inline __m128i dc_sum_32_sse2(const uint8_t *ref) {
   28|   147k|  __m128i x0 = _mm_load_si128((__m128i const *)ref);
   29|   147k|  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
   30|   147k|  const __m128i zero = _mm_setzero_si128();
   31|   147k|  x0 = _mm_sad_epu8(x0, zero);
   32|   147k|  x1 = _mm_sad_epu8(x1, zero);
   33|   147k|  x0 = _mm_add_epi16(x0, x1);
   34|   147k|  const __m128i high = _mm_unpackhi_epi64(x0, x0);
   35|   147k|  return _mm_add_epi16(x0, high);
   36|   147k|}
intrapred_avx2.c:dc_sum_16_sse2:
   19|   208k|static inline __m128i dc_sum_16_sse2(const uint8_t *ref) {
   20|   208k|  __m128i x = _mm_load_si128((__m128i const *)ref);
   21|   208k|  const __m128i zero = _mm_setzero_si128();
   22|   208k|  x = _mm_sad_epu8(x, zero);
   23|   208k|  const __m128i high = _mm_unpackhi_epi64(x, x);
   24|   208k|  return _mm_add_epi16(x, high);
   25|   208k|}

aom_lpf_horizontal_4_sse2:
  331|  3.16M|                               const uint8_t *_thresh) {
  332|  3.16M|  const __m128i zero = _mm_setzero_si128();
  333|  3.16M|  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
  334|  3.16M|                                     _mm_loadl_epi64((const __m128i *)_limit));
  335|  3.16M|  __m128i thresh =
  336|  3.16M|      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  337|       |
  338|  3.16M|  __m128i qs1qs0, ps1ps0;
  339|  3.16M|  __m128i p1, p0, q0, q1;
  340|       |
  341|  3.16M|  p1 = xx_loadl_32(s - 2 * p);
  342|  3.16M|  p0 = xx_loadl_32(s - 1 * p);
  343|  3.16M|  q0 = xx_loadl_32(s - 0 * p);
  344|  3.16M|  q1 = xx_loadl_32(s + 1 * p);
  345|       |
  346|  3.16M|  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
  347|       |
  348|  3.16M|  xx_storel_32(s - 1 * p, ps1ps0);
  349|  3.16M|  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
  350|  3.16M|  xx_storel_32(s + 0 * p, qs1qs0);
  351|  3.16M|  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
  352|  3.16M|}
aom_lpf_vertical_4_sse2:
  356|   968k|                             const uint8_t *_thresh) {
  357|   968k|  __m128i p1p0, q1q0;
  358|   968k|  __m128i p1, p0, q0, q1;
  359|       |
  360|   968k|  const __m128i zero = _mm_setzero_si128();
  361|   968k|  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
  362|   968k|                                     _mm_loadl_epi64((const __m128i *)_limit));
  363|   968k|  __m128i thresh =
  364|   968k|      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  365|       |
  366|   968k|  __m128i x0, x1, x2, x3;
  367|   968k|  __m128i d0, d1, d2, d3;
  368|   968k|  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
  369|   968k|  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
  370|   968k|  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
  371|   968k|  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
  372|       |
  373|   968k|  transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
  374|       |
  375|   968k|  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
  376|       |
  377|       |  // Transpose 8x4 to 4x8
  378|   968k|  p1 = _mm_srli_si128(p1p0, 4);
  379|   968k|  q1 = _mm_srli_si128(q1q0, 4);
  380|       |
  381|   968k|  transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
  382|       |
  383|   968k|  xx_storel_32(s + 0 * p - 2, d0);
  384|   968k|  xx_storel_32(s + 1 * p - 2, d1);
  385|   968k|  xx_storel_32(s + 2 * p - 2, d2);
  386|   968k|  xx_storel_32(s + 3 * p - 2, d3);
  387|   968k|}
aom_lpf_horizontal_14_sse2:
  960|  6.75M|                                const unsigned char *_thresh) {
  961|  6.75M|  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
  962|  6.75M|  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  963|  6.75M|  __m128i limit = _mm_load_si128((const __m128i *)_limit);
  964|  6.75M|  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  965|       |
  966|  6.75M|  q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p));
  967|  6.75M|  q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p));
  968|  6.75M|  q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p));
  969|  6.75M|  q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p));
  970|       |
  971|  6.75M|  q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p));
  972|       |
  973|  6.75M|  q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p));
  974|       |
  975|  6.75M|  q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p));
  976|       |
  977|  6.75M|  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
  978|  6.75M|                       &limit, &thresh);
  979|       |
  980|  6.75M|  store_buffer_horz_8(q0p0, p, 0, s);
  981|  6.75M|  store_buffer_horz_8(q1p1, p, 1, s);
  982|  6.75M|  store_buffer_horz_8(q2p2, p, 2, s);
  983|  6.75M|  store_buffer_horz_8(q3p3, p, 3, s);
  984|  6.75M|  store_buffer_horz_8(q4p4, p, 4, s);
  985|  6.75M|  store_buffer_horz_8(q5p5, p, 5, s);
  986|  6.75M|}
aom_lpf_horizontal_6_sse2:
 1254|  6.43M|                               const unsigned char *_thresh) {
 1255|  6.43M|  __m128i p2, p1, p0, q0, q1, q2;
 1256|  6.43M|  __m128i p1p0, q1q0;
 1257|  6.43M|  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
 1258|  6.43M|  __m128i limit = _mm_load_si128((__m128i *)_limit);
 1259|  6.43M|  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
 1260|       |
 1261|  6.43M|  p2 = xx_loadl_32(s - 3 * p);
 1262|  6.43M|  p1 = xx_loadl_32(s - 2 * p);
 1263|  6.43M|  p0 = xx_loadl_32(s - 1 * p);
 1264|  6.43M|  q0 = xx_loadl_32(s - 0 * p);
 1265|  6.43M|  q1 = xx_loadl_32(s + 1 * p);
 1266|  6.43M|  q2 = xx_loadl_32(s + 2 * p);
 1267|       |
 1268|  6.43M|  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
 1269|  6.43M|                      &limit, &thresh);
 1270|       |
 1271|  6.43M|  xx_storel_32(s - 1 * p, p1p0);
 1272|  6.43M|  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
 1273|  6.43M|  xx_storel_32(s + 0 * p, q1q0);
 1274|  6.43M|  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
 1275|  6.43M|}
aom_lpf_horizontal_8_sse2:
 1612|  2.78M|                               const unsigned char *_thresh) {
 1613|  2.78M|  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 1614|  2.78M|  __m128i q1q0, p1p0;
 1615|  2.78M|  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
 1616|  2.78M|  __m128i limit = _mm_load_si128((const __m128i *)_limit);
 1617|  2.78M|  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
 1618|       |
 1619|  2.78M|  p3 = xx_loadl_32(s - 4 * p);
 1620|  2.78M|  p2 = xx_loadl_32(s - 3 * p);
 1621|  2.78M|  p1 = xx_loadl_32(s - 2 * p);
 1622|  2.78M|  p0 = xx_loadl_32(s - 1 * p);
 1623|  2.78M|  q0 = xx_loadl_32(s - 0 * p);
 1624|  2.78M|  q1 = xx_loadl_32(s + 1 * p);
 1625|  2.78M|  q2 = xx_loadl_32(s + 2 * p);
 1626|  2.78M|  q3 = xx_loadl_32(s + 3 * p);
 1627|       |
 1628|  2.78M|  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
 1629|  2.78M|                      &blimit, &limit, &thresh);
 1630|       |
 1631|  2.78M|  xx_storel_32(s - 1 * p, p1p0);
 1632|  2.78M|  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
 1633|  2.78M|  xx_storel_32(s + 0 * p, q1q0);
 1634|  2.78M|  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
 1635|  2.78M|  xx_storel_32(s - 3 * p, p2);
 1636|  2.78M|  xx_storel_32(s + 2 * p, q2);
 1637|  2.78M|}
aom_lpf_vertical_6_sse2:
 1830|  5.06M|                             const unsigned char *_thresh) {
 1831|  5.06M|  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
 1832|  5.06M|  __m128i x2, x1, x0, x3;
 1833|  5.06M|  __m128i p0, q0;
 1834|  5.06M|  __m128i p1p0, q1q0;
 1835|  5.06M|  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
 1836|  5.06M|  __m128i limit = _mm_load_si128((__m128i *)_limit);
 1837|  5.06M|  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
 1838|       |
 1839|  5.06M|  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
 1840|  5.06M|  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
 1841|  5.06M|  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
 1842|  5.06M|  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
 1843|       |
 1844|  5.06M|  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
 1845|  5.06M|                        &d7);
 1846|       |
 1847|  5.06M|  lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
 1848|  5.06M|                      &limit, &thresh);
 1849|       |
 1850|  5.06M|  p0 = _mm_srli_si128(p1p0, 4);
 1851|  5.06M|  q0 = _mm_srli_si128(q1q0, 4);
 1852|       |
 1853|  5.06M|  transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
 1854|       |
 1855|  5.06M|  xx_storel_32(s + 0 * p - 2, d0);
 1856|  5.06M|  xx_storel_32(s + 1 * p - 2, d1);
 1857|  5.06M|  xx_storel_32(s + 2 * p - 2, d2);
 1858|  5.06M|  xx_storel_32(s + 3 * p - 2, d3);
 1859|  5.06M|}
aom_lpf_vertical_8_sse2:
 1919|   541k|                             const unsigned char *_thresh) {
 1920|   541k|  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
 1921|       |
 1922|   541k|  __m128i p0, q0;
 1923|   541k|  __m128i x2, x1, x0, x3;
 1924|   541k|  __m128i q1q0, p1p0;
 1925|   541k|  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
 1926|   541k|  __m128i limit = _mm_load_si128((const __m128i *)_limit);
 1927|   541k|  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
 1928|       |
 1929|   541k|  x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
 1930|   541k|  x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
 1931|   541k|  x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
 1932|   541k|  x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
 1933|       |
 1934|   541k|  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
 1935|   541k|                        &d7);
 1936|       |  // Loop filtering
 1937|   541k|  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
 1938|   541k|                      &blimit, &limit, &thresh);
 1939|       |
 1940|   541k|  p0 = _mm_srli_si128(p1p0, 4);
 1941|   541k|  q0 = _mm_srli_si128(q1q0, 4);
 1942|       |
 1943|   541k|  transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
 1944|   541k|                        &d2, &d3);
 1945|       |
 1946|   541k|  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
 1947|   541k|  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
 1948|   541k|  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
 1949|   541k|  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
 1950|   541k|}
aom_lpf_vertical_14_sse2:
 2010|  5.44M|                              const unsigned char *_thresh) {
 2011|  5.44M|  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
 2012|  5.44M|  __m128i x6, x5, x4, x3;
 2013|  5.44M|  __m128i pq0, pq1, pq2, pq3;
 2014|  5.44M|  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
 2015|  5.44M|  __m128i limit = _mm_load_si128((__m128i *)_limit);
 2016|  5.44M|  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
 2017|       |
 2018|  5.44M|  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
 2019|  5.44M|  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
 2020|  5.44M|  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
 2021|  5.44M|  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
 2022|       |
 2023|  5.44M|  transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
 2024|  5.44M|                       &q5p5, &q6p6, &q7p7);
 2025|       |
 2026|  5.44M|  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
 2027|  5.44M|                       &limit, &thresh);
 2028|       |
 2029|  5.44M|  transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
 2030|  5.44M|                           &q0p0, &pq0, &pq1, &pq2, &pq3);
 2031|  5.44M|  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
 2032|  5.44M|  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
 2033|  5.44M|  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
 2034|  5.44M|  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
 2035|  5.44M|}
loopfilter_sse2.c:lpf_internal_4_sse2:
  246|  3.84M|    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
  247|  3.84M|  __m128i q1p1, q0p0, p1p0, q1q0;
  248|  3.84M|  __m128i abs_p0q0, abs_p1q1;
  249|  3.84M|  __m128i mask, flat, hev;
  250|  3.84M|  const __m128i zero = _mm_setzero_si128();
  251|       |
  252|  3.84M|  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
  253|  3.84M|  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
  254|       |
  255|  3.84M|  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
  256|  3.84M|  q1q0 = _mm_srli_si128(p1p0, 8);
  257|       |
  258|       |  /* (abs(q1 - q0), abs(p1 - p0) */
  259|  3.84M|  flat = abs_diff(q1p1, q0p0);
  260|       |  /* abs(p1 - q1), abs(p0 - q0) */
  261|  3.84M|  __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
  262|       |
  263|       |  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
  264|  3.84M|  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
  265|  3.84M|  hev = _mm_unpacklo_epi8(flat, zero);
  266|       |
  267|  3.84M|  hev = _mm_cmpgt_epi16(hev, *thresh);
  268|  3.84M|  hev = _mm_packs_epi16(hev, hev);
  269|  3.84M|  hev = _mm_unpacklo_epi32(hev, hev);
  270|       |
  271|  3.84M|  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
  272|  3.84M|  abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4);           /* abs(p1 - q1) */
  273|  3.84M|  abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
  274|  3.84M|  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
  275|  3.84M|  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
  276|       |  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
  277|       |
  278|  3.84M|  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
  279|  3.84M|  mask = _mm_unpacklo_epi32(mask, flat);
  280|  3.84M|  mask = _mm_subs_epu8(mask, *limit);
  281|  3.84M|  mask = _mm_cmpeq_epi8(mask, zero);
  282|  3.84M|  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
  283|       |
  284|  3.84M|  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
  285|  3.84M|}
loopfilter_sse2.c:filter4_sse2:
  143|  17.1M|                                          __m128i *qs1qs0, __m128i *ps1ps0) {
  144|  17.1M|  __m128i filter, filter2filter1, work;
  145|  17.1M|  __m128i ps1ps0_work, qs1qs0_work;
  146|  17.1M|  __m128i hev1;
  147|  17.1M|  const __m128i t3t4 =
  148|  17.1M|      _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
  149|  17.1M|  const __m128i t80 = _mm_set1_epi8((char)0x80);
  150|  17.1M|  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
  151|       |
  152|  17.1M|  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
  153|  17.1M|  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
  154|       |
  155|       |  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
  156|  17.1M|  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
  157|  17.1M|  filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
  158|       |  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
  159|  17.1M|  filter = _mm_subs_epi8(filter, work);
  160|  17.1M|  filter = _mm_subs_epi8(filter, work);
  161|  17.1M|  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
  162|  17.1M|  filter = _mm_and_si128(filter, *mask); /* & mask */
  163|  17.1M|  filter = _mm_unpacklo_epi32(filter, filter);
  164|       |
  165|       |  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
  166|       |  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
  167|  17.1M|  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
  168|  17.1M|  filter2filter1 =
  169|  17.1M|      _mm_unpacklo_epi8(filter2filter1, filter2filter1);  // goto 16 bit
  170|  17.1M|  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);    /* >> 3 */
  171|  17.1M|  filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
  172|       |
  173|       |  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
  174|  17.1M|  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */
  175|  17.1M|  filter = _mm_unpacklo_epi8(filter, filter);  // goto 16 bit
  176|  17.1M|  filter = _mm_srai_epi16(filter, 9);          /* round */
  177|  17.1M|  filter = _mm_packs_epi16(filter, filter);
  178|  17.1M|  filter = _mm_andnot_si128(*hev, filter);
  179|  17.1M|  filter = _mm_unpacklo_epi32(filter, filter);
  180|       |
  181|  17.1M|  filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
  182|  17.1M|  hev1 = _mm_srli_si128(filter2filter1, 8);
  183|       |  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
  184|  17.1M|  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
  185|       |  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
  186|  17.1M|  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
  187|       |
  188|  17.1M|  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
  189|  17.1M|  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
  190|  17.1M|}
loopfilter_sse2.c:lpf_internal_14_sse2:
  700|  8.03M|    __m128i *thresh) {
  701|  8.03M|  const __m128i zero = _mm_setzero_si128();
  702|  8.03M|  const __m128i one = _mm_set1_epi8(1);
  703|  8.03M|  __m128i mask, hev, flat, flat2;
  704|  8.03M|  __m128i flat2_pq[6], flat_pq[3];
  705|  8.03M|  __m128i qs0ps0, qs1ps1;
  706|  8.03M|  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
  707|  8.03M|  __m128i abs_p1p0;
  708|       |
  709|  8.03M|  p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
  710|  8.03M|  q1q0 = _mm_srli_si128(p1p0, 8);
  711|       |
  712|  8.03M|  __m128i fe, ff, work;
  713|  8.03M|  {
  714|  8.03M|    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
  715|  8.03M|    abs_p1p0 = abs_diff(*q1p1, *q0p0);
  716|  8.03M|    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
  717|  8.03M|    fe = _mm_set1_epi8((char)0xfe);
  718|  8.03M|    ff = _mm_cmpeq_epi8(fe, fe);
  719|  8.03M|    abs_p0q0 = abs_diff(p1p0, q1q0);
  720|  8.03M|    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
  721|       |
  722|  8.03M|    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
  723|       |
  724|  8.03M|    hev = _mm_subs_epu8(flat, *thresh);
  725|  8.03M|    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
  726|       |    // replicate for the further "merged variables" usage
  727|  8.03M|    hev = _mm_unpacklo_epi32(hev, hev);
  728|       |
  729|  8.03M|    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
  730|  8.03M|    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
  731|  8.03M|    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
  732|  8.03M|    mask = _mm_unpacklo_epi32(mask, zero);
  733|  8.03M|    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
  734|       |    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  735|  8.03M|    mask = _mm_max_epu8(abs_p1p0, mask);
  736|       |    // mask |= (abs(p1 - p0) > limit) * -1;
  737|       |    // mask |= (abs(q1 - q0) > limit) * -1;
  738|       |
  739|  8.03M|    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
  740|  8.03M|    mask = _mm_max_epu8(work, mask);
  741|  8.03M|    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
  742|  8.03M|    mask = _mm_subs_epu8(mask, *limit);
  743|  8.03M|    mask = _mm_cmpeq_epi8(mask, zero);
  744|  8.03M|  }
  745|       |
  746|       |  // lp filter - the same for 6, 8 and 14 versions
  747|  8.03M|  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
  748|  8.03M|  qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
  749|  8.03M|  qs1ps1 = _mm_srli_si128(qs0ps0, 8);
  750|       |  // loopfilter done
  751|       |
  752|  8.03M|  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
  753|  8.03M|  flat = _mm_max_epu8(abs_p1p0, flat);
  754|  8.03M|  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
  755|  8.03M|  flat = _mm_subs_epu8(flat, one);
  756|  8.03M|  flat = _mm_cmpeq_epi8(flat, zero);
  757|  8.03M|  flat = _mm_and_si128(flat, mask);
  758|  8.03M|  flat = _mm_unpacklo_epi32(flat, flat);
  759|  8.03M|  flat = _mm_unpacklo_epi64(flat, flat);
  760|       |
  761|       |  // if flat ==0 then flat2 is zero as well and we don't need any calc below
  762|       |  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
  763|  8.03M|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
  ------------------
  |  Branch (763:7): [True: 4.83M, False: 3.19M]
  ------------------
  764|       |    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  765|       |    // flat and wide flat calculations
  766|  4.83M|    __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
  767|  4.83M|    __m128i pq_16[7];
  768|  4.83M|    const __m128i eight = _mm_set1_epi16(8);
  769|  4.83M|    const __m128i four = _mm_set1_epi16(4);
  770|  4.83M|    __m128i sum_p6;
  771|  4.83M|    __m128i sum_p3;
  772|       |
  773|  4.83M|    pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
  774|  4.83M|    pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
  775|  4.83M|    pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
  776|  4.83M|    pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
  777|  4.83M|    pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
  778|  4.83M|    pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
  779|  4.83M|    pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
  780|  4.83M|    q0_16 = _mm_srli_si128(pq_16[0], 8);
  781|  4.83M|    q1_16 = _mm_srli_si128(pq_16[1], 8);
  782|  4.83M|    q2_16 = _mm_srli_si128(pq_16[2], 8);
  783|  4.83M|    q3_16 = _mm_srli_si128(pq_16[3], 8);
  784|  4.83M|    q4_16 = _mm_srli_si128(pq_16[4], 8);
  785|  4.83M|    q5_16 = _mm_srli_si128(pq_16[5], 8);
  786|       |
  787|  4.83M|    __m128i flat_p[3], flat_q[3];
  788|  4.83M|    __m128i flat2_p[6], flat2_q[6];
  789|       |
  790|  4.83M|    __m128i work0, work0_0, work0_1, sum_p_0;
  791|  4.83M|    __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
  792|  4.83M|    __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
  793|  4.83M|    sum_p = _mm_add_epi16(sum_p, sum_lp);
  794|       |
  795|  4.83M|    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
  796|  4.83M|    __m128i sum_q = _mm_srli_si128(sum_p, 8);
  797|       |
  798|  4.83M|    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
  799|  4.83M|    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
  800|       |
  801|  4.83M|    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
  802|  4.83M|    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
  803|       |
  804|  4.83M|    sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
  805|  4.83M|    sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
  806|       |
  807|  4.83M|    sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
  808|  4.83M|    sum_p = _mm_sub_epi16(sum_p_0, q5_16);
  809|       |
  810|  4.83M|    work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
  811|  4.83M|    work0_1 = _mm_add_epi16(
  812|  4.83M|        sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
  813|       |
  814|  4.83M|    sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
  815|  4.83M|    sum_lp = _mm_sub_epi16(sum_lp, q2_16);
  816|       |
  817|  4.83M|    work0 = _mm_add_epi16(sum_p3, pq_16[1]);
  818|  4.83M|    flat_p[1] = _mm_add_epi16(sum_lp, work0);
  819|  4.83M|    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
  820|       |
  821|  4.83M|    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
  822|  4.83M|    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
  823|  4.83M|    flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
  824|  4.83M|    flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
  825|       |
  826|  4.83M|    sum_lp = _mm_sub_epi16(sum_lp, q1_16);
  827|  4.83M|    sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
  828|       |
  829|  4.83M|    sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
  830|  4.83M|    work0 = _mm_add_epi16(sum_p3, pq_16[2]);
  831|       |
  832|  4.83M|    flat_p[2] = _mm_add_epi16(sum_lp, work0);
  833|  4.83M|    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
  834|  4.83M|    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
  835|  4.83M|    flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
  836|       |
  837|       |    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  838|  4.83M|    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
  839|       |
  840|  4.83M|    work = abs_diff(*q6p6, *q0p0);
  841|  4.83M|    flat2 = _mm_max_epu8(work, flat2);
  842|  4.83M|    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
  843|  4.83M|    flat2 = _mm_subs_epu8(flat2, one);
  844|  4.83M|    flat2 = _mm_cmpeq_epi8(flat2, zero);
  845|  4.83M|    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  846|  4.83M|    flat2 = _mm_unpacklo_epi32(flat2, flat2);
  847|       |
  848|       |    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  849|  4.83M|    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
  850|  4.83M|    flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
  851|  4.83M|    *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
  852|       |
  853|  4.83M|    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
  854|  4.83M|    flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
  855|  4.83M|    *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
  856|       |
  857|  4.83M|    *q2p2 = _mm_andnot_si128(flat, *q2p2);
  858|  4.83M|    flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
  859|  4.83M|    *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
  860|       |
  861|  4.83M|    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
  ------------------
  |  Branch (861:9): [True: 4.39M, False: 446k]
  ------------------
  862|  4.39M|      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
  863|  4.39M|      flat2_q[0] = _mm_add_epi16(
  864|  4.39M|          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
  865|       |
  866|  4.39M|      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
  867|  4.39M|      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
  868|       |
  869|  4.39M|      flat2_pq[0] =
  870|  4.39M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
  871|  4.39M|      flat2_pq[1] =
  872|  4.39M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
  873|  4.39M|      flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
  874|  4.39M|      flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
  875|       |
  876|  4.39M|      sum_p = _mm_sub_epi16(sum_p, q4_16);
  877|  4.39M|      sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
  878|       |
  879|  4.39M|      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
  880|  4.39M|      work0 = _mm_add_epi16(
  881|  4.39M|          sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
  882|  4.39M|      flat2_p[2] = _mm_add_epi16(sum_p, work0);
  883|  4.39M|      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  884|  4.39M|      flat2_pq[2] =
  885|  4.39M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
  886|  4.39M|      flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
  887|       |
  888|  4.39M|      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
  889|  4.39M|      sum_p = _mm_sub_epi16(sum_p, q3_16);
  890|  4.39M|      sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
  891|       |
  892|  4.39M|      work0 = _mm_add_epi16(
  893|  4.39M|          sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
  894|  4.39M|      flat2_p[3] = _mm_add_epi16(sum_p, work0);
  895|  4.39M|      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  896|  4.39M|      flat2_pq[3] =
  897|  4.39M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
  898|  4.39M|      flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
  899|       |
  900|  4.39M|      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
  901|  4.39M|      sum_p = _mm_sub_epi16(sum_p, q2_16);
  902|  4.39M|      sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
  903|       |
  904|  4.39M|      work0 = _mm_add_epi16(
  905|  4.39M|          sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
  906|  4.39M|      flat2_p[4] = _mm_add_epi16(sum_p, work0);
  907|  4.39M|      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  908|  4.39M|      flat2_pq[4] =
  909|  4.39M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
  910|  4.39M|      flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
  911|       |
  912|  4.39M|      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
  913|  4.39M|      sum_p = _mm_sub_epi16(sum_p, q1_16);
  914|  4.39M|      sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
  915|       |
  916|  4.39M|      work0 = _mm_add_epi16(
  917|  4.39M|          sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
  918|  4.39M|      flat2_p[5] = _mm_add_epi16(sum_p, work0);
  919|  4.39M|      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
  920|  4.39M|      flat2_pq[5] =
  921|  4.39M|          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
  922|  4.39M|      flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
  923|       |
  924|       |      // wide flat
  925|       |      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  926|       |
  927|  4.39M|      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
  928|  4.39M|      flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
  929|  4.39M|      *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
  930|       |
  931|  4.39M|      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
  932|  4.39M|      flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
  933|  4.39M|      *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
  934|       |
  935|  4.39M|      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
  936|  4.39M|      flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
  937|  4.39M|      *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
  938|       |
  939|  4.39M|      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
  940|  4.39M|      flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
  941|  4.39M|      *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
  942|       |
  943|  4.39M|      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
  944|  4.39M|      flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
  945|  4.39M|      *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
  946|       |
  947|  4.39M|      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
  948|  4.39M|      flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
  949|  4.39M|      *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
  950|  4.39M|    }
  951|  4.83M|  } else {
  952|  3.19M|    *q0p0 = qs0ps0;
  953|  3.19M|    *q1p1 = qs1ps1;
  954|  3.19M|  }
  955|  8.03M|}
loopfilter_sse2.c:store_buffer_horz_8:
  389|  21.9M|static inline void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
  390|  21.9M|  xx_storel_32(s - (num + 1) * p, x);
  391|  21.9M|  xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
  392|  21.9M|}
loopfilter_sse2.c:lpf_internal_6_sse2:
 1120|  9.52M|    __m128i *thresh) {
 1121|  9.52M|  const __m128i zero = _mm_setzero_si128();
 1122|  9.52M|  __m128i mask, hev, flat;
 1123|  9.52M|  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
 1124|  9.52M|  __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
 1125|  9.52M|  __m128i ps1ps0, qs1qs0;
 1126|       |
 1127|  9.52M|  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
 1128|  9.52M|  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
 1129|  9.52M|  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
 1130|       |
 1131|  9.52M|  *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
 1132|  9.52M|  *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
 1133|       |
 1134|  9.52M|  const __m128i one = _mm_set1_epi8(1);
 1135|  9.52M|  const __m128i fe = _mm_set1_epi8((char)0xfe);
 1136|  9.52M|  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
 1137|  9.52M|  {
 1138|       |    // filter_mask and hev_mask
 1139|  9.52M|    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
 1140|  9.52M|    abs_p1p0 = abs_diff(q1p1, q0p0);
 1141|  9.52M|    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
 1142|       |
 1143|  9.52M|    abs_p0q0 = abs_diff(*p1p0, *q1q0);
 1144|  9.52M|    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
 1145|       |
 1146|       |    // considering sse doesn't have unsigned elements comparison the idea is
 1147|       |    // to find at least one case when X > limit, it means the corresponding
 1148|       |    // mask bit is set.
 1149|       |    // to achieve that we find global max value of all inputs of abs(x-y) or
 1150|       |    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
 1151|       |    // otherwise - not
 1152|       |
 1153|  9.52M|    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
 1154|  9.52M|    hev = _mm_subs_epu8(flat, *thresh);
 1155|  9.52M|    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 1156|       |    // replicate for the further "merged variables" usage
 1157|  9.52M|    hev = _mm_unpacklo_epi32(hev, hev);
 1158|       |
 1159|  9.52M|    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
 1160|  9.52M|    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
 1161|  9.52M|    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
 1162|  9.52M|    mask = _mm_unpacklo_epi32(mask, zero);
 1163|  9.52M|    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
 1164|       |    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
 1165|  9.52M|    mask = _mm_max_epu8(abs_p1p0, mask);
 1166|       |    // mask |= (abs(p1 - p0) > limit) * -1;
 1167|       |    // mask |= (abs(q1 - q0) > limit) * -1;
 1168|       |
 1169|  9.52M|    work = abs_diff(q2p2, q1p1);
 1170|  9.52M|    mask = _mm_max_epu8(work, mask);
 1171|  9.52M|    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
 1172|  9.52M|    mask = _mm_subs_epu8(mask, *limit);
 1173|  9.52M|    mask = _mm_cmpeq_epi8(mask, zero);
 1174|       |
 1175|       |    // lp filter - the same for 6, 8 and 14 versions
 1176|  9.52M|    filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
 1177|       |
 1178|       |    // flat_mask
 1179|  9.52M|    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
 1180|  9.52M|    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
 1181|  9.52M|    flat = _mm_subs_epu8(flat, one);
 1182|  9.52M|    flat = _mm_cmpeq_epi8(flat, zero);
 1183|  9.52M|    flat = _mm_and_si128(flat, mask);
 1184|       |    // replicate for the further "merged variables" usage
 1185|  9.52M|    flat = _mm_unpacklo_epi32(flat, flat);
 1186|  9.52M|    flat = _mm_unpacklo_epi64(flat, flat);
 1187|  9.52M|  }
 1188|       |
 1189|       |  // 5 tap filter
 1190|       |  // need it only if flat !=0
 1191|  9.52M|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
  ------------------
  |  Branch (1191:7): [True: 6.78M, False: 2.73M]
  ------------------
 1192|  6.78M|    const __m128i four = _mm_set1_epi16(4);
 1193|  6.78M|    __m128i workp_a, workp_b, workp_c;
 1194|  6.78M|    __m128i pq0x2_pq1, pq1_pq2;
 1195|  6.78M|    pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
 1196|  6.78M|    pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
 1197|  6.78M|    pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
 1198|  6.78M|    q0_16 = _mm_srli_si128(pq0_16, 8);
 1199|  6.78M|    q2_16 = _mm_srli_si128(pq2_16, 8);
 1200|       |
 1201|       |    // op1
 1202|  6.78M|    pq0x2_pq1 =
 1203|  6.78M|        _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16);  // p0 *2 + p1
 1204|  6.78M|    pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16);                   // p1 + p2
 1205|  6.78M|    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
 1206|  6.78M|                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
 1207|       |
 1208|  6.78M|    workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
 1209|  6.78M|    workp_b =
 1210|  6.78M|        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
 1211|       |
 1212|       |    // op0
 1213|  6.78M|    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
 1214|  6.78M|    workp_a = _mm_add_epi16(workp_a,
 1215|  6.78M|                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
 1216|  6.78M|    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
 1217|  6.78M|    workp_b = _mm_srli_epi16(workp_b, 3);
 1218|       |
 1219|  6.78M|    flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
 1220|       |
 1221|       |    // oq0
 1222|  6.78M|    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
 1223|  6.78M|                            pq1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
 1224|  6.78M|    workp_b = _mm_srli_si128(pq1_pq2, 8);
 1225|  6.78M|    workp_a = _mm_add_epi16(
 1226|  6.78M|        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
 1227|       |    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
 1228|       |
 1229|       |    // oq1
 1230|  6.78M|    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
 1231|  6.78M|                            pq0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
 1232|  6.78M|    workp_b = _mm_add_epi16(q2_16, q2_16);
 1233|  6.78M|    workp_b =
 1234|  6.78M|        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
 1235|       |
 1236|  6.78M|    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
 1237|  6.78M|    workp_a = _mm_srli_epi16(workp_a, 3);
 1238|       |
 1239|  6.78M|    flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
 1240|       |
 1241|  6.78M|    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
 1242|  6.78M|    *q1q0 = _mm_and_si128(flat, flat_q0q1);
 1243|  6.78M|    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
 1244|       |
 1245|  6.78M|    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
 1246|  6.78M|    *p1p0 = _mm_and_si128(flat, flat_p1p0);
 1247|  6.78M|    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
 1248|  6.78M|  }
 1249|  9.52M|}
loopfilter_sse2.c:lpf_internal_8_sse2:
 1313|  2.66M|    __m128i *blimit, __m128i *limit, __m128i *thresh) {
 1314|  2.66M|  const __m128i zero = _mm_setzero_si128();
 1315|  2.66M|  __m128i mask, hev, flat;
 1316|  2.66M|  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
 1317|  2.66M|      flat_p1p0, flat_q0q1;
 1318|  2.66M|  __m128i q2p2, q1p1, q0p0;
 1319|  2.66M|  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
 1320|  2.66M|  __m128i work_pq, opq2, pq2;
 1321|       |
 1322|  2.66M|  q3p3 = _mm_unpacklo_epi32(*p3, *q3);
 1323|  2.66M|  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
 1324|  2.66M|  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
 1325|  2.66M|  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
 1326|       |
 1327|  2.66M|  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);  // p1p0 q1q0
 1328|  2.66M|  q1q0 = _mm_srli_si128(p1p0, 8);
 1329|       |
 1330|       |  // filter_mask and hev_mask
 1331|       |
 1332|       |  // considering sse doesn't have unsigned elements comparison the idea is to
 1333|       |  // find at least one case when X > limit, it means the corresponding  mask
 1334|       |  // bit is set.
 1335|       |  // to achieve that we find global max value of all inputs of abs(x-y) or
 1336|       |  // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
 1337|       |  // otherwise - not
 1338|       |
 1339|  2.66M|  const __m128i one = _mm_set1_epi8(1);
 1340|  2.66M|  const __m128i fe = _mm_set1_epi8((char)0xfe);
 1341|  2.66M|  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
 1342|  2.66M|  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
 1343|       |
 1344|  2.66M|  abs_p1p0 = abs_diff(q1p1, q0p0);
 1345|  2.66M|  abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
 1346|       |
 1347|  2.66M|  abs_p0q0 = abs_diff(p1p0, q1q0);
 1348|  2.66M|  abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
 1349|       |
 1350|  2.66M|  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
 1351|  2.66M|  hev = _mm_subs_epu8(flat, *thresh);
 1352|  2.66M|  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 1353|       |  // replicate for the further "merged variables" usage
 1354|  2.66M|  hev = _mm_unpacklo_epi32(hev, hev);
 1355|       |
 1356|  2.66M|  abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
 1357|  2.66M|  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
 1358|  2.66M|  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
 1359|  2.66M|  mask = _mm_unpacklo_epi32(mask, zero);
 1360|  2.66M|  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
 1361|       |  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
 1362|  2.66M|  mask = _mm_max_epu8(abs_p1p0, mask);
 1363|       |  // mask |= (abs(p1 - p0) > limit) * -1;
 1364|       |  // mask |= (abs(q1 - q0) > limit) * -1;
 1365|       |
 1366|  2.66M|  work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
 1367|       |
 1368|  2.66M|  mask = _mm_max_epu8(work, mask);
 1369|  2.66M|  mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
 1370|  2.66M|  mask = _mm_subs_epu8(mask, *limit);
 1371|  2.66M|  mask = _mm_cmpeq_epi8(mask, zero);
 1372|       |
 1373|       |  // lp filter - the same for 6, 8 and 14 versions
 1374|  2.66M|  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
 1375|       |
 1376|       |  // flat_mask4
 1377|  2.66M|  flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
 1378|  2.66M|  flat = _mm_max_epu8(abs_p1p0, flat);
 1379|       |
 1380|  2.66M|  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
 1381|  2.66M|  flat = _mm_subs_epu8(flat, one);
 1382|  2.66M|  flat = _mm_cmpeq_epi8(flat, zero);
 1383|  2.66M|  flat = _mm_and_si128(flat, mask);
 1384|       |  // replicate for the further "merged variables" usage
 1385|  2.66M|  flat = _mm_unpacklo_epi32(flat, flat);
 1386|  2.66M|  flat = _mm_unpacklo_epi64(flat, flat);
 1387|       |
 1388|       |  // filter8 need it only if flat !=0
 1389|  2.66M|  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
  ------------------
  |  Branch (1389:7): [True: 814k, False: 1.84M]
  ------------------
 1390|   814k|    const __m128i four = _mm_set1_epi16(4);
 1391|   814k|    __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
 1392|   814k|    p2_16 = _mm_unpacklo_epi8(*p2, zero);
 1393|   814k|    p1_16 = _mm_unpacklo_epi8(*p1, zero);
 1394|   814k|    p0_16 = _mm_unpacklo_epi8(*p0, zero);
 1395|   814k|    q0_16 = _mm_unpacklo_epi8(*q0, zero);
 1396|   814k|    q1_16 = _mm_unpacklo_epi8(*q1, zero);
 1397|   814k|    q2_16 = _mm_unpacklo_epi8(*q2, zero);
 1398|   814k|    p3_16 = _mm_unpacklo_epi8(*p3, zero);
 1399|   814k|    q3_16 = _mm_unpacklo_epi8(*q3, zero);
 1400|       |
 1401|       |    // op2
 1402|   814k|    workp_a =
 1403|   814k|        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
 1404|   814k|    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
 1405|   814k|    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
 1406|   814k|    workp_shft2 = _mm_add_epi16(workp_a, workp_b);
 1407|       |
 1408|       |    // op1
 1409|   814k|    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
 1410|   814k|    workp_c = _mm_add_epi16(workp_a, workp_b);
 1411|       |    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 1412|       |
 1413|       |    // op0
 1414|   814k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
 1415|   814k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
 1416|   814k|    workp_d = _mm_add_epi16(workp_a, workp_b);
 1417|       |    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 1418|       |
 1419|   814k|    workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
 1420|   814k|    workp_c = _mm_srli_epi16(workp_c, 3);
 1421|   814k|    flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
 1422|       |
 1423|       |    // oq0
 1424|   814k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
 1425|   814k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
 1426|       |    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 1427|   814k|    workp_c = _mm_add_epi16(workp_a, workp_b);
 1428|       |
 1429|       |    // oq1
 1430|   814k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
 1431|   814k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
 1432|   814k|    workp_d = _mm_add_epi16(workp_a, workp_b);
 1433|       |    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
 1434|       |
 1435|   814k|    workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
 1436|   814k|    workp_c = _mm_srli_epi16(workp_c, 3);
 1437|   814k|    flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
 1438|       |
 1439|       |    // oq2
 1440|   814k|    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
 1441|   814k|    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
 1442|   814k|    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
 1443|       |
 1444|   814k|    workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
 1445|   814k|    workp_c = _mm_srli_epi16(workp_c, 3);
 1446|       |
 1447|   814k|    opq2 = _mm_packus_epi16(workp_c, workp_c);
 1448|       |
 1449|   814k|    work_pq = _mm_andnot_si128(flat, q2p2);
 1450|   814k|    pq2 = _mm_and_si128(flat, opq2);
 1451|   814k|    *p2 = _mm_or_si128(work_pq, pq2);
 1452|   814k|    *q2 = _mm_srli_si128(*p2, 4);
 1453|       |
 1454|   814k|    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
 1455|   814k|    q1q0 = _mm_and_si128(flat, flat_q0q1);
 1456|   814k|    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 1457|       |
 1458|   814k|    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
 1459|   814k|    p1p0 = _mm_and_si128(flat, flat_p1p0);
 1460|   814k|    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
 1461|   814k|  }
 1462|  2.66M|}
loopfilter_sse2.c:transpose_pq_14_sse2:
   33|  5.48M|                                        __m128i *q7p7) {
   34|  5.48M|  __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
   35|  5.48M|  w0 = _mm_unpacklo_epi8(
   36|  5.48M|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
   37|  5.48M|  w1 = _mm_unpacklo_epi8(
   38|  5.48M|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
   39|  5.48M|  w2 = _mm_unpackhi_epi8(
   40|  5.48M|      *x0, *x1);  // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
   41|  5.48M|  w3 = _mm_unpackhi_epi8(
   42|  5.48M|      *x2, *x3);  // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
   43|       |
   44|  5.48M|  ww0 = _mm_unpacklo_epi16(
   45|  5.48M|      w0, w1);  // 00 10 20 30 01 11 21 31        02 12 22 32 03 13 23 33
   46|  5.48M|  ww1 = _mm_unpackhi_epi16(
   47|  5.48M|      w0, w1);  // 04 14 24 34 05 15 25 35        06 16 26 36 07 17 27 37
   48|  5.48M|  ww2 = _mm_unpacklo_epi16(
   49|  5.48M|      w2, w3);  // 08 18 28 38 09 19 29 39       010 110 210 310 011 111 211 311
   50|  5.48M|  ww3 = _mm_unpackhi_epi16(
   51|  5.48M|      w2,
   52|  5.48M|      w3);  // 012 112 212 312 013 113 213 313  014 114 214 314 015 115 215 315
   53|       |
   54|  5.48M|  *q7p7 = _mm_unpacklo_epi32(
   55|  5.48M|      ww0,
   56|  5.48M|      _mm_srli_si128(
   57|  5.48M|          ww3, 12));  // 00 10 20 30  015 115 215 315  xx xx xx xx xx xx xx xx
   58|  5.48M|  *q6p6 = _mm_unpackhi_epi32(
   59|  5.48M|      _mm_slli_si128(ww0, 4),
   60|  5.48M|      ww3);  // 01 11 21 31  014 114 214 314  xx xx xx xxxx xx xx xx
   61|  5.48M|  *q5p5 = _mm_unpackhi_epi32(
   62|  5.48M|      ww0,
   63|  5.48M|      _mm_slli_si128(
   64|  5.48M|          ww3, 4));  // 02 12 22 32  013 113 213 313  xx xx xx x xx xx xx xxx
   65|  5.48M|  *q4p4 = _mm_unpacklo_epi32(
   66|  5.48M|      _mm_srli_si128(ww0, 12),
   67|  5.48M|      ww3);  // 03 13 23 33  012 112 212 312 xx xx xx xx xx xx xx xx
   68|  5.48M|  *q3p3 = _mm_unpacklo_epi32(
   69|  5.48M|      ww1,
   70|  5.48M|      _mm_srli_si128(
   71|  5.48M|          ww2, 12));  // 04 14 24 34  011 111 211 311 xx xx xx xx xx xx xx xx
   72|  5.48M|  *q2p2 = _mm_unpackhi_epi32(
   73|  5.48M|      _mm_slli_si128(ww1, 4),
   74|  5.48M|      ww2);  // 05 15 25 35   010 110 210 310 xx xx xx xx xx xx xx xx
   75|  5.48M|  *q1p1 = _mm_unpackhi_epi32(
   76|  5.48M|      ww1,
   77|  5.48M|      _mm_slli_si128(
   78|  5.48M|          ww2, 4));  // 06 16 26 36   09 19 29 39     xx xx xx xx xx xx xx xx
   79|  5.48M|  *q0p0 = _mm_unpacklo_epi32(
   80|  5.48M|      _mm_srli_si128(ww1, 12),
   81|  5.48M|      ww2);  // 07 17 27 37  08 18 28 38     xx xx xx xx xx xx xx xx
   82|  5.48M|}
loopfilter_sse2.c:transpose_pq_14_inv_sse2:
   92|  3.82M|                                            __m128i *pq2, __m128i *pq3) {
   93|  3.82M|  __m128i w10, w11, w12, w13;
   94|  3.82M|  __m128i w0, w1, w2, w3, w4, w5;
   95|  3.82M|  __m128i d0, d1, d2, d3;
   96|       |
   97|  3.82M|  w0 = _mm_unpacklo_epi8(
   98|  3.82M|      *x0, *x1);  // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
   99|  3.82M|  w1 = _mm_unpacklo_epi8(
  100|  3.82M|      *x2, *x3);  // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  101|  3.82M|  w2 = _mm_unpacklo_epi8(
  102|  3.82M|      *x4, *x5);  // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  103|  3.82M|  w3 = _mm_unpacklo_epi8(
  104|  3.82M|      *x6, *x7);  // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  105|       |
  106|  3.82M|  w4 = _mm_unpacklo_epi16(
  107|  3.82M|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  108|  3.82M|  w5 = _mm_unpacklo_epi16(
  109|  3.82M|      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  110|       |
  111|  3.82M|  d0 = _mm_unpacklo_epi32(
  112|  3.82M|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  113|  3.82M|  d2 = _mm_unpackhi_epi32(
  114|  3.82M|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  115|       |
  116|  3.82M|  w10 = _mm_unpacklo_epi8(
  117|  3.82M|      *x7, *x6);  // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
  118|  3.82M|  w11 = _mm_unpacklo_epi8(
  119|  3.82M|      *x5, *x4);  // q  xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
  120|  3.82M|  w12 = _mm_unpacklo_epi8(
  121|  3.82M|      *x3, *x2);  // q  xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
  122|  3.82M|  w13 = _mm_unpacklo_epi8(
  123|  3.82M|      *x1, *x0);  // q  xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
  124|       |
  125|  3.82M|  w4 = _mm_unpackhi_epi16(
  126|  3.82M|      w10, w11);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  127|  3.82M|  w5 = _mm_unpackhi_epi16(
  128|  3.82M|      w12, w13);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  129|       |
  130|  3.82M|  d1 = _mm_unpacklo_epi32(
  131|  3.82M|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  132|  3.82M|  d3 = _mm_unpackhi_epi32(
  133|  3.82M|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  134|       |
  135|  3.82M|  *pq0 = _mm_unpacklo_epi64(d0, d1);  // pq
  136|  3.82M|  *pq1 = _mm_unpackhi_epi64(d0, d1);  // pq
  137|  3.82M|  *pq2 = _mm_unpacklo_epi64(d2, d3);  // pq
  138|  3.82M|  *pq3 = _mm_unpackhi_epi64(d2, d3);  // pq
  139|  3.82M|}
loopfilter_sse2.c:abs_diff:
   21|  80.2M|static inline __m128i abs_diff(__m128i a, __m128i b) {
   22|  80.2M|  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
   23|  80.2M|}

loopfilter_sse2.c:transpose4x8_8x4_low_sse2:
  222|  4.48M|                                             __m128i *d2, __m128i *d3) {
  223|       |  // input
  224|       |  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  225|       |  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  226|       |  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  227|       |  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  228|       |  // output
  229|       |  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  230|       |  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  231|       |  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  232|       |  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  233|       |
  234|  4.48M|  __m128i w0, w1;
  235|       |
  236|  4.48M|  w0 = _mm_unpacklo_epi8(
  237|  4.48M|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  238|  4.48M|  w1 = _mm_unpacklo_epi8(
  239|  4.48M|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  240|       |
  241|  4.48M|  *d0 = _mm_unpacklo_epi16(
  242|  4.48M|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  243|       |
  244|  4.48M|  *d1 = _mm_srli_si128(*d0,
  245|  4.48M|                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  246|  4.48M|  *d2 = _mm_srli_si128(*d0,
  247|  4.48M|                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  248|  4.48M|  *d3 = _mm_srli_si128(*d0,
  249|  4.48M|                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  250|  4.48M|}
loopfilter_sse2.c:transpose8x8_low_sse2:
  305|   495k|                                         __m128i *d3) {
  306|       |  // input
  307|       |  // x0 00 01 02 03 04 05 06 07
  308|       |  // x1 10 11 12 13 14 15 16 17
  309|       |  // x2 20 21 22 23 24 25 26 27
  310|       |  // x3 30 31 32 33 34 35 36 37
  311|       |  // x4 40 41 42 43 44 45 46 47
  312|       |  // x5  50 51 52 53 54 55 56 57
  313|       |  // x6  60 61 62 63 64 65 66 67
  314|       |  // x7 70 71 72 73 74 75 76 77
  315|       |  // output
  316|       |  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
  317|       |  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
  318|       |  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
  319|       |  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
  320|       |
  321|   495k|  __m128i w0, w1, w2, w3, w4, w5;
  322|       |
  323|   495k|  w0 = _mm_unpacklo_epi8(
  324|   495k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  325|       |
  326|   495k|  w1 = _mm_unpacklo_epi8(
  327|   495k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  328|       |
  329|   495k|  w2 = _mm_unpacklo_epi8(
  330|   495k|      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  331|       |
  332|   495k|  w3 = _mm_unpacklo_epi8(
  333|   495k|      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  334|       |
  335|   495k|  w4 = _mm_unpacklo_epi16(
  336|   495k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  337|   495k|  w5 = _mm_unpacklo_epi16(
  338|   495k|      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  339|       |
  340|   495k|  *d0 = _mm_unpacklo_epi32(
  341|   495k|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  342|   495k|  *d1 = _mm_srli_si128(*d0, 8);
  343|   495k|  *d2 = _mm_unpackhi_epi32(
  344|   495k|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  345|   495k|  *d3 = _mm_srli_si128(*d2, 8);
  346|   495k|}
loopfilter_sse2.c:transpose4x8_8x4_sse2:
  256|  5.06M|                                         __m128i *d7) {
  257|       |  // input
  258|       |  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  259|       |  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  260|       |  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  261|       |  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  262|       |  // output
  263|       |  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  264|       |  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  265|       |  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  266|       |  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  267|       |  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  268|       |  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  269|       |  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  270|       |  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
  271|       |
  272|  5.06M|  __m128i w0, w1, ww0, ww1;
  273|       |
  274|  5.06M|  w0 = _mm_unpacklo_epi8(
  275|  5.06M|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  276|  5.06M|  w1 = _mm_unpacklo_epi8(
  277|  5.06M|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  278|       |
  279|  5.06M|  ww0 = _mm_unpacklo_epi16(
  280|  5.06M|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  281|  5.06M|  ww1 = _mm_unpackhi_epi16(
  282|  5.06M|      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  283|       |
  284|  5.06M|  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  285|  5.06M|  *d1 = _mm_srli_si128(ww0,
  286|  5.06M|                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  287|  5.06M|  *d2 = _mm_srli_si128(ww0,
  288|  5.06M|                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  289|  5.06M|  *d3 = _mm_srli_si128(ww0,
  290|  5.06M|                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  291|       |
  292|  5.06M|  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  293|  5.06M|  *d5 = _mm_srli_si128(ww1,
  294|  5.06M|                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  295|  5.06M|  *d6 = _mm_srli_si128(ww1,
  296|  5.06M|                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  297|  5.06M|  *d7 = _mm_srli_si128(ww1,
  298|  5.06M|                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
  299|  5.06M|}
highbd_loopfilter_sse2.c:highbd_transpose4x8_8x4_low_sse2:
   65|  71.7M|                                                    __m128i *d2, __m128i *d3) {
   66|  71.7M|  __m128i zero = _mm_setzero_si128();
   67|  71.7M|  __m128i w0, w1, ww0, ww1;
   68|       |
   69|  71.7M|  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
   70|  71.7M|  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
   71|       |
   72|  71.7M|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
   73|  71.7M|  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
   74|       |
   75|  71.7M|  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
   76|  71.7M|  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
   77|  71.7M|  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
   78|  71.7M|  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
   79|  71.7M|}
highbd_loopfilter_sse2.c:highbd_transpose8x8_low_sse2:
  131|  10.5M|                                                __m128i *d2, __m128i *d3) {
  132|  10.5M|  __m128i w0, w1, w2, w3, ww0, ww1;
  133|       |  // x0 00 01 02 03 04 05 06 07
  134|       |  // x1 10 11 12 13 14 15 16 17
  135|       |  // x2 20 21 22 23 24 25 26 27
  136|       |  // x3 30 31 32 33 34 35 36 37
  137|       |  // x4 40 41 42 43 44 45 46 47
  138|       |  // x5 50 51 52 53 54 55 56 57
  139|       |  // x6 60 61 62 63 64 65 66 67
  140|       |  // x7 70 71 72 73 74 75 76 77
  141|       |
  142|  10.5M|  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  143|  10.5M|  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  144|  10.5M|  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  145|  10.5M|  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
  146|       |
  147|  10.5M|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  148|  10.5M|  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
  149|       |
  150|  10.5M|  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  151|  10.5M|  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
  152|       |
  153|  10.5M|  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  154|  10.5M|  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
  155|       |
  156|  10.5M|  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  157|  10.5M|  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
  158|  10.5M|}
highbd_loopfilter_sse2.c:highbd_transpose4x8_8x4_sse2:
  107|  15.9M|                                                __m128i *d6, __m128i *d7) {
  108|       |  // input
  109|       |  // x0 00 01 02 03 04 05 06 07
  110|       |  // x1 10 11 12 13 14 15 16 17
  111|       |  // x2 20 21 22 23 24 25 26 27
  112|       |  // x3 30 31 32 33 34 35 36 37
  113|       |  // output
  114|       |  // 00 10 20 30 xx xx xx xx
  115|       |  // 01 11 21 31 xx xx xx xx
  116|       |  // 02 12 22 32 xx xx xx xx
  117|       |  // 03 13 23 33 xx xx xx xx
  118|       |  // 04 14 24 34 xx xx xx xx
  119|       |  // 05 15 25 35 xx xx xx xx
  120|       |  // 06 16 26 36 xx xx xx xx
  121|       |  // 07 17 27 37 xx xx xx xx
  122|  15.9M|  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  123|  15.9M|  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
  124|  15.9M|}
highbd_loopfilter_sse2.c:highbd_transpose4x8_8x4_high_sse2:
   84|  16.9M|                                                     __m128i *d6, __m128i *d7) {
   85|  16.9M|  __m128i w0, w1, ww2, ww3;
   86|  16.9M|  __m128i zero = _mm_setzero_si128();
   87|       |
   88|  16.9M|  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
   89|  16.9M|  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
   90|       |
   91|  16.9M|  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
   92|  16.9M|  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
   93|       |
   94|  16.9M|  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
   95|  16.9M|  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
   96|  16.9M|  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
   97|  16.9M|  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
   98|  16.9M|}
intrapred_avx2.c:highbd_transpose4x8_8x4_low_sse2:
   65|   261k|                                                    __m128i *d2, __m128i *d3) {
   66|   261k|  __m128i zero = _mm_setzero_si128();
   67|   261k|  __m128i w0, w1, ww0, ww1;
   68|       |
   69|   261k|  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
   70|   261k|  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
   71|       |
   72|   261k|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
   73|   261k|  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
   74|       |
   75|   261k|  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
   76|   261k|  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
   77|   261k|  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
   78|   261k|  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
   79|   261k|}
intrapred_avx2.c:highbd_transpose8x8_sse2:
  199|   610k|    __m128i *d7) {
  200|   610k|  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
  201|   610k|  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
  202|   610k|}
intrapred_avx2.c:highbd_transpose8x8_low_sse2:
  131|   676k|                                                __m128i *d2, __m128i *d3) {
  132|   676k|  __m128i w0, w1, w2, w3, ww0, ww1;
  133|       |  // x0 00 01 02 03 04 05 06 07
  134|       |  // x1 10 11 12 13 14 15 16 17
  135|       |  // x2 20 21 22 23 24 25 26 27
  136|       |  // x3 30 31 32 33 34 35 36 37
  137|       |  // x4 40 41 42 43 44 45 46 47
  138|       |  // x5 50 51 52 53 54 55 56 57
  139|       |  // x6 60 61 62 63 64 65 66 67
  140|       |  // x7 70 71 72 73 74 75 76 77
  141|       |
  142|   676k|  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  143|   676k|  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  144|   676k|  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  145|   676k|  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
  146|       |
  147|   676k|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  148|   676k|  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
  149|       |
  150|   676k|  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  151|   676k|  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
  152|       |
  153|   676k|  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  154|   676k|  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
  155|       |
  156|   676k|  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  157|   676k|  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
  158|   676k|}
intrapred_avx2.c:highbd_transpose8x8_high_sse2:
  165|   610k|                                                 __m128i *d6, __m128i *d7) {
  166|   610k|  __m128i w0, w1, w2, w3, ww0, ww1;
  167|       |  // x0 00 01 02 03 04 05 06 07
  168|       |  // x1 10 11 12 13 14 15 16 17
  169|       |  // x2 20 21 22 23 24 25 26 27
  170|       |  // x3 30 31 32 33 34 35 36 37
  171|       |  // x4 40 41 42 43 44 45 46 47
  172|       |  // x5 50 51 52 53 54 55 56 57
  173|       |  // x6 60 61 62 63 64 65 66 67
  174|       |  // x7 70 71 72 73 74 75 76 77
  175|   610k|  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  176|   610k|  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
  177|   610k|  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
  178|   610k|  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
  179|       |
  180|   610k|  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
  181|   610k|  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
  182|       |
  183|   610k|  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
  184|   610k|  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
  185|       |
  186|   610k|  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
  187|   610k|  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
  188|       |
  189|   610k|  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
  190|   610k|  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
  191|   610k|}
intrapred_avx2.c:highbd_transpose4x8_8x4_sse2:
  107|  35.7k|                                                __m128i *d6, __m128i *d7) {
  108|       |  // input
  109|       |  // x0 00 01 02 03 04 05 06 07
  110|       |  // x1 10 11 12 13 14 15 16 17
  111|       |  // x2 20 21 22 23 24 25 26 27
  112|       |  // x3 30 31 32 33 34 35 36 37
  113|       |  // output
  114|       |  // 00 10 20 30 xx xx xx xx
  115|       |  // 01 11 21 31 xx xx xx xx
  116|       |  // 02 12 22 32 xx xx xx xx
  117|       |  // 03 13 23 33 xx xx xx xx
  118|       |  // 04 14 24 34 xx xx xx xx
  119|       |  // 05 15 25 35 xx xx xx xx
  120|       |  // 06 16 26 36 xx xx xx xx
  121|       |  // 07 17 27 37 xx xx xx xx
  122|  35.7k|  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  123|  35.7k|  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
  124|  35.7k|}
intrapred_avx2.c:highbd_transpose4x8_8x4_high_sse2:
   84|  35.7k|                                                     __m128i *d6, __m128i *d7) {
   85|  35.7k|  __m128i w0, w1, ww2, ww3;
   86|  35.7k|  __m128i zero = _mm_setzero_si128();
   87|       |
   88|  35.7k|  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
   89|  35.7k|  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
   90|       |
   91|  35.7k|  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
   92|  35.7k|  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
   93|       |
   94|  35.7k|  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
   95|  35.7k|  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
   96|  35.7k|  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
   97|  35.7k|  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
   98|  35.7k|}
intrapred_avx2.c:transpose4x8_8x4_low_sse2:
  222|   115k|                                             __m128i *d2, __m128i *d3) {
  223|       |  // input
  224|       |  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  225|       |  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  226|       |  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  227|       |  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  228|       |  // output
  229|       |  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  230|       |  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  231|       |  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  232|       |  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  233|       |
  234|   115k|  __m128i w0, w1;
  235|       |
  236|   115k|  w0 = _mm_unpacklo_epi8(
  237|   115k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  238|   115k|  w1 = _mm_unpacklo_epi8(
  239|   115k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  240|       |
  241|   115k|  *d0 = _mm_unpacklo_epi16(
  242|   115k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  243|       |
  244|   115k|  *d1 = _mm_srli_si128(*d0,
  245|   115k|                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  246|   115k|  *d2 = _mm_srli_si128(*d0,
  247|   115k|                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  248|   115k|  *d3 = _mm_srli_si128(*d0,
  249|   115k|                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  250|   115k|}
intrapred_avx2.c:transpose8x8_sse2:
  352|  98.0k|                                     __m128i *d6d7) {
  353|  98.0k|  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
  354|       |  // x0 00 01 02 03 04 05 06 07
  355|       |  // x1 10 11 12 13 14 15 16 17
  356|  98.0k|  w0 = _mm_unpacklo_epi8(
  357|  98.0k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  358|       |
  359|       |  // x2 20 21 22 23 24 25 26 27
  360|       |  // x3 30 31 32 33 34 35 36 37
  361|  98.0k|  w1 = _mm_unpacklo_epi8(
  362|  98.0k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  363|       |
  364|       |  // x4 40 41 42 43 44 45 46 47
  365|       |  // x5  50 51 52 53 54 55 56 57
  366|  98.0k|  w2 = _mm_unpacklo_epi8(
  367|  98.0k|      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  368|       |
  369|       |  // x6  60 61 62 63 64 65 66 67
  370|       |  // x7 70 71 72 73 74 75 76 77
  371|  98.0k|  w3 = _mm_unpacklo_epi8(
  372|  98.0k|      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  373|       |
  374|  98.0k|  w4 = _mm_unpacklo_epi16(
  375|  98.0k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  376|  98.0k|  w5 = _mm_unpacklo_epi16(
  377|  98.0k|      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  378|       |
  379|  98.0k|  *d0d1 = _mm_unpacklo_epi32(
  380|  98.0k|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  381|  98.0k|  *d2d3 = _mm_unpackhi_epi32(
  382|  98.0k|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  383|       |
  384|  98.0k|  w6 = _mm_unpackhi_epi16(
  385|  98.0k|      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  386|  98.0k|  w7 = _mm_unpackhi_epi16(
  387|  98.0k|      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  388|       |
  389|  98.0k|  *d4d5 = _mm_unpacklo_epi32(
  390|  98.0k|      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  391|  98.0k|  *d6d7 = _mm_unpackhi_epi32(
  392|  98.0k|      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  393|  98.0k|}
intrapred_avx2.c:transpose4x8_8x4_sse2:
  256|  23.9k|                                         __m128i *d7) {
  257|       |  // input
  258|       |  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  259|       |  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  260|       |  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  261|       |  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  262|       |  // output
  263|       |  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  264|       |  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  265|       |  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  266|       |  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  267|       |  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  268|       |  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  269|       |  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  270|       |  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
  271|       |
  272|  23.9k|  __m128i w0, w1, ww0, ww1;
  273|       |
  274|  23.9k|  w0 = _mm_unpacklo_epi8(
  275|  23.9k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  276|  23.9k|  w1 = _mm_unpacklo_epi8(
  277|  23.9k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  278|       |
  279|  23.9k|  ww0 = _mm_unpacklo_epi16(
  280|  23.9k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  281|  23.9k|  ww1 = _mm_unpackhi_epi16(
  282|  23.9k|      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  283|       |
  284|  23.9k|  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  285|  23.9k|  *d1 = _mm_srli_si128(ww0,
  286|  23.9k|                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  287|  23.9k|  *d2 = _mm_srli_si128(ww0,
  288|  23.9k|                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  289|  23.9k|  *d3 = _mm_srli_si128(ww0,
  290|  23.9k|                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  291|       |
  292|  23.9k|  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  293|  23.9k|  *d5 = _mm_srli_si128(ww1,
  294|  23.9k|                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  295|  23.9k|  *d6 = _mm_srli_si128(ww1,
  296|  23.9k|                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  297|  23.9k|  *d7 = _mm_srli_si128(ww1,
  298|  23.9k|                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
  299|  23.9k|}
intrapred_avx2.c:transpose8x16_16x8_sse2:
  451|  26.2k|    __m128i *d12d13, __m128i *d14d15) {
  452|  26.2k|  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  453|  26.2k|  __m128i w10, w11, w12, w13, w14, w15;
  454|       |
  455|  26.2k|  w0 = _mm_unpacklo_epi8(*x0, *x1);
  456|  26.2k|  w1 = _mm_unpacklo_epi8(*x2, *x3);
  457|  26.2k|  w2 = _mm_unpacklo_epi8(*x4, *x5);
  458|  26.2k|  w3 = _mm_unpacklo_epi8(*x6, *x7);
  459|       |
  460|  26.2k|  w8 = _mm_unpackhi_epi8(*x0, *x1);
  461|  26.2k|  w9 = _mm_unpackhi_epi8(*x2, *x3);
  462|  26.2k|  w10 = _mm_unpackhi_epi8(*x4, *x5);
  463|  26.2k|  w11 = _mm_unpackhi_epi8(*x6, *x7);
  464|       |
  465|  26.2k|  w4 = _mm_unpacklo_epi16(w0, w1);
  466|  26.2k|  w5 = _mm_unpacklo_epi16(w2, w3);
  467|  26.2k|  w12 = _mm_unpacklo_epi16(w8, w9);
  468|  26.2k|  w13 = _mm_unpacklo_epi16(w10, w11);
  469|       |
  470|  26.2k|  w6 = _mm_unpacklo_epi32(w4, w5);
  471|  26.2k|  w7 = _mm_unpackhi_epi32(w4, w5);
  472|  26.2k|  w14 = _mm_unpacklo_epi32(w12, w13);
  473|  26.2k|  w15 = _mm_unpackhi_epi32(w12, w13);
  474|       |
  475|       |  // Store first 4-line result
  476|  26.2k|  *d0d1 = _mm_unpacklo_epi64(w6, w14);
  477|  26.2k|  *d2d3 = _mm_unpackhi_epi64(w6, w14);
  478|  26.2k|  *d4d5 = _mm_unpacklo_epi64(w7, w15);
  479|  26.2k|  *d6d7 = _mm_unpackhi_epi64(w7, w15);
  480|       |
  481|  26.2k|  w4 = _mm_unpackhi_epi16(w0, w1);
  482|  26.2k|  w5 = _mm_unpackhi_epi16(w2, w3);
  483|  26.2k|  w12 = _mm_unpackhi_epi16(w8, w9);
  484|  26.2k|  w13 = _mm_unpackhi_epi16(w10, w11);
  485|       |
  486|  26.2k|  w6 = _mm_unpacklo_epi32(w4, w5);
  487|  26.2k|  w7 = _mm_unpackhi_epi32(w4, w5);
  488|  26.2k|  w14 = _mm_unpacklo_epi32(w12, w13);
  489|  26.2k|  w15 = _mm_unpackhi_epi32(w12, w13);
  490|       |
  491|       |  // Store second 4-line result
  492|  26.2k|  *d8d9 = _mm_unpacklo_epi64(w6, w14);
  493|  26.2k|  *d10d11 = _mm_unpackhi_epi64(w6, w14);
  494|  26.2k|  *d12d13 = _mm_unpacklo_epi64(w7, w15);
  495|  26.2k|  *d14d15 = _mm_unpackhi_epi64(w7, w15);
  496|  26.2k|}
intrapred_avx2.c:transpose8x8_low_sse2:
  305|  40.7k|                                         __m128i *d3) {
  306|       |  // input
  307|       |  // x0 00 01 02 03 04 05 06 07
  308|       |  // x1 10 11 12 13 14 15 16 17
  309|       |  // x2 20 21 22 23 24 25 26 27
  310|       |  // x3 30 31 32 33 34 35 36 37
  311|       |  // x4 40 41 42 43 44 45 46 47
  312|       |  // x5  50 51 52 53 54 55 56 57
  313|       |  // x6  60 61 62 63 64 65 66 67
  314|       |  // x7 70 71 72 73 74 75 76 77
  315|       |  // output
  316|       |  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
  317|       |  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
  318|       |  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
  319|       |  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
  320|       |
  321|  40.7k|  __m128i w0, w1, w2, w3, w4, w5;
  322|       |
  323|  40.7k|  w0 = _mm_unpacklo_epi8(
  324|  40.7k|      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  325|       |
  326|  40.7k|  w1 = _mm_unpacklo_epi8(
  327|  40.7k|      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  328|       |
  329|  40.7k|  w2 = _mm_unpacklo_epi8(
  330|  40.7k|      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  331|       |
  332|  40.7k|  w3 = _mm_unpacklo_epi8(
  333|  40.7k|      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  334|       |
  335|  40.7k|  w4 = _mm_unpacklo_epi16(
  336|  40.7k|      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  337|  40.7k|  w5 = _mm_unpacklo_epi16(
  338|  40.7k|      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  339|       |
  340|  40.7k|  *d0 = _mm_unpacklo_epi32(
  341|  40.7k|      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  342|  40.7k|  *d1 = _mm_srli_si128(*d0, 8);
  343|  40.7k|  *d2 = _mm_unpackhi_epi32(
  344|  40.7k|      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  345|  40.7k|  *d3 = _mm_srli_si128(*d2, 8);
  346|  40.7k|}
intrapred_avx2.c:transpose16x8_8x16_sse2:
  400|   195k|    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
  401|   195k|  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  402|   195k|  __m128i w10, w11, w12, w13, w14, w15;
  403|       |
  404|   195k|  w0 = _mm_unpacklo_epi8(*x0, *x1);
  405|   195k|  w1 = _mm_unpacklo_epi8(*x2, *x3);
  406|   195k|  w2 = _mm_unpacklo_epi8(*x4, *x5);
  407|   195k|  w3 = _mm_unpacklo_epi8(*x6, *x7);
  408|       |
  409|   195k|  w8 = _mm_unpacklo_epi8(*x8, *x9);
  410|   195k|  w9 = _mm_unpacklo_epi8(*x10, *x11);
  411|   195k|  w10 = _mm_unpacklo_epi8(*x12, *x13);
  412|   195k|  w11 = _mm_unpacklo_epi8(*x14, *x15);
  413|       |
  414|   195k|  w4 = _mm_unpacklo_epi16(w0, w1);
  415|   195k|  w5 = _mm_unpacklo_epi16(w2, w3);
  416|   195k|  w12 = _mm_unpacklo_epi16(w8, w9);
  417|   195k|  w13 = _mm_unpacklo_epi16(w10, w11);
  418|       |
  419|   195k|  w6 = _mm_unpacklo_epi32(w4, w5);
  420|   195k|  w7 = _mm_unpackhi_epi32(w4, w5);
  421|   195k|  w14 = _mm_unpacklo_epi32(w12, w13);
  422|   195k|  w15 = _mm_unpackhi_epi32(w12, w13);
  423|       |
  424|       |  // Store first 4-line result
  425|   195k|  *d0 = _mm_unpacklo_epi64(w6, w14);
  426|   195k|  *d1 = _mm_unpackhi_epi64(w6, w14);
  427|   195k|  *d2 = _mm_unpacklo_epi64(w7, w15);
  428|   195k|  *d3 = _mm_unpackhi_epi64(w7, w15);
  429|       |
  430|   195k|  w4 = _mm_unpackhi_epi16(w0, w1);
  431|   195k|  w5 = _mm_unpackhi_epi16(w2, w3);
  432|   195k|  w12 = _mm_unpackhi_epi16(w8, w9);
  433|   195k|  w13 = _mm_unpackhi_epi16(w10, w11);
  434|       |
  435|   195k|  w6 = _mm_unpacklo_epi32(w4, w5);
  436|   195k|  w7 = _mm_unpackhi_epi32(w4, w5);
  437|   195k|  w14 = _mm_unpacklo_epi32(w12, w13);
  438|   195k|  w15 = _mm_unpackhi_epi32(w12, w13);
  439|       |
  440|       |  // Store second 4-line result
  441|   195k|  *d4 = _mm_unpacklo_epi64(w6, w14);
  442|   195k|  *d5 = _mm_unpackhi_epi64(w6, w14);
  443|   195k|  *d6 = _mm_unpacklo_epi64(w7, w15);
  444|   195k|  *d7 = _mm_unpackhi_epi64(w7, w15);
  445|   195k|}

convolve_2d_avx2.c:loadu_int32:
   28|  1.00M|static inline int32_t loadu_int32(const void *src) {
   29|  1.00M|  int32_t v;
   30|  1.00M|  memcpy(&v, src, sizeof(v));
   31|  1.00M|  return v;
   32|  1.00M|}
convolve_2d_avx2.c:load_8bit_8x2_to_1_reg_sse2:
   58|  4.28M|                                                  const int byte_stride) {
   59|  4.28M|  __m128i dst;
   60|  4.28M|  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
   61|  4.28M|  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
   62|  4.28M|  return dst;
   63|  4.28M|}
convolve_2d_avx2.c:loadh_epi64:
   44|  4.28M|static inline __m128i loadh_epi64(const void *const src, const __m128i s) {
   45|  4.28M|  return _mm_castps_si128(
   46|  4.28M|      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
   47|  4.28M|}
convolve_avx2.c:loadu_int32:
   28|  2.69M|static inline int32_t loadu_int32(const void *src) {
   29|  2.69M|  int32_t v;
   30|  2.69M|  memcpy(&v, src, sizeof(v));
   31|  2.69M|  return v;
   32|  2.69M|}
convolve_avx2.c:_mm_storeh_epi64:
   40|  51.3k|static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
   41|  51.3k|  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
   42|  51.3k|}
convolve_avx2.c:loadu_int16:
   22|   462k|static inline int16_t loadu_int16(const void *src) {
   23|   462k|  int16_t v;
   24|   462k|  memcpy(&v, src, sizeof(v));
   25|   462k|  return v;
   26|   462k|}
convolve_avx2.c:load_8bit_8x2_to_1_reg_sse2:
   58|  1.11M|                                                  const int byte_stride) {
   59|  1.11M|  __m128i dst;
   60|  1.11M|  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
   61|  1.11M|  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
   62|  1.11M|  return dst;
   63|  1.11M|}
convolve_avx2.c:loadh_epi64:
   44|  1.11M|static inline __m128i loadh_epi64(const void *const src, const __m128i s) {
   45|  1.11M|  return _mm_castps_si128(
   46|  1.11M|      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
   47|  1.11M|}
jnt_convolve_avx2.c:_mm_storeh_epi64:
   40|   285k|static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
   41|   285k|  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
   42|   285k|}
jnt_convolve_avx2.c:loadu_int32:
   28|   210k|static inline int32_t loadu_int32(const void *src) {
   29|   210k|  int32_t v;
   30|   210k|  memcpy(&v, src, sizeof(v));
   31|   210k|  return v;
   32|   210k|}

loopfilter_sse2.c:xx_loadl_32:
   31|  78.2M|static inline __m128i xx_loadl_32(const void *a) {
   32|  78.2M|  int val;
   33|  78.2M|  memcpy(&val, a, sizeof(val));
   34|  78.2M|  return _mm_cvtsi32_si128(val);
   35|  78.2M|}
loopfilter_sse2.c:xx_storel_32:
   57|  67.5M|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  67.5M|  const int val = _mm_cvtsi128_si32(v);
   59|  67.5M|  memcpy(a, &val, sizeof(val));
   60|  67.5M|}
blend_a64_mask_sse4.c:xx_loadu_128:
   45|  25.3M|static inline __m128i xx_loadu_128(const void *a) {
   46|  25.3M|  return _mm_loadu_si128((const __m128i *)a);
   47|  25.3M|}
blend_a64_mask_sse4.c:xx_storeu_128:
   70|  11.7M|static inline void xx_storeu_128(void *const a, const __m128i v) {
   71|  11.7M|  _mm_storeu_si128((__m128i *)a, v);
   72|  11.7M|}
blend_a64_mask_sse4.c:xx_roundn_epu16:
   88|  13.9M|static inline __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
   89|  13.9M|  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
   90|  13.9M|  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
   91|  13.9M|}
blend_a64_mask_sse4.c:xx_loadl_32:
   31|  13.3M|static inline __m128i xx_loadl_32(const void *a) {
   32|  13.3M|  int val;
   33|  13.3M|  memcpy(&val, a, sizeof(val));
   34|  13.3M|  return _mm_cvtsi32_si128(val);
   35|  13.3M|}
blend_a64_mask_sse4.c:xx_storel_32:
   57|  3.41M|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  3.41M|  const int val = _mm_cvtsi128_si32(v);
   59|  3.41M|  memcpy(a, &val, sizeof(val));
   60|  3.41M|}
blend_a64_mask_sse4.c:xx_loadl_64:
   37|  24.2M|static inline __m128i xx_loadl_64(const void *a) {
   38|  24.2M|  return _mm_loadl_epi64((const __m128i *)a);
   39|  24.2M|}
blend_a64_mask_sse4.c:xx_storel_64:
   62|  5.58M|static inline void xx_storel_64(void *const a, const __m128i v) {
   63|  5.58M|  _mm_storel_epi64((__m128i *)a, v);
   64|  5.58M|}
blend_a64_mask_sse4.c:xx_round_epu16:
   84|   810k|static inline __m128i xx_round_epu16(__m128i v_val_w) {
   85|   810k|  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
   86|   810k|}
blend_a64_vmask_sse4.c:xx_loadl_64:
   37|  11.0M|static inline __m128i xx_loadl_64(const void *a) {
   38|  11.0M|  return _mm_loadl_epi64((const __m128i *)a);
   39|  11.0M|}
blend_a64_vmask_sse4.c:xx_roundn_epu16:
   88|  9.84M|static inline __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
   89|  9.84M|  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
   90|  9.84M|  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
   91|  9.84M|}
blend_a64_vmask_sse4.c:xx_storeu_128:
   70|  6.07M|static inline void xx_storeu_128(void *const a, const __m128i v) {
   71|  6.07M|  _mm_storeu_si128((__m128i *)a, v);
   72|  6.07M|}
blend_a64_vmask_sse4.c:xx_loadl_32:
   31|   648k|static inline __m128i xx_loadl_32(const void *a) {
   32|   648k|  int val;
   33|   648k|  memcpy(&val, a, sizeof(val));
   34|   648k|  return _mm_cvtsi32_si128(val);
   35|   648k|}
blend_a64_vmask_sse4.c:xx_storel_32:
   57|   324k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|   324k|  const int val = _mm_cvtsi128_si32(v);
   59|   324k|  memcpy(a, &val, sizeof(val));
   60|   324k|}
blend_a64_vmask_sse4.c:xx_storel_64:
   62|  2.06M|static inline void xx_storel_64(void *const a, const __m128i v) {
   63|  2.06M|  _mm_storel_epi64((__m128i *)a, v);
   64|  2.06M|}
blend_a64_vmask_sse4.c:xx_loadu_128:
   45|  8.67M|static inline __m128i xx_loadu_128(const void *a) {
   46|  8.67M|  return _mm_loadu_si128((const __m128i *)a);
   47|  8.67M|}
blend_a64_vmask_sse4.c:xx_round_epu16:
   84|   359k|static inline __m128i xx_round_epu16(__m128i v_val_w) {
   85|   359k|  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
   86|   359k|}
blend_a64_mask_avx2.c:xx_loadl_32:
   31|  1.45M|static inline __m128i xx_loadl_32(const void *a) {
   32|  1.45M|  int val;
   33|  1.45M|  memcpy(&val, a, sizeof(val));
   34|  1.45M|  return _mm_cvtsi32_si128(val);
   35|  1.45M|}
blend_a64_mask_avx2.c:xx_loadl_64:
   37|  6.28M|static inline __m128i xx_loadl_64(const void *a) {
   38|  6.28M|  return _mm_loadl_epi64((const __m128i *)a);
   39|  6.28M|}
blend_a64_mask_avx2.c:xx_storel_32:
   57|   733k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|   733k|  const int val = _mm_cvtsi128_si32(v);
   59|   733k|  memcpy(a, &val, sizeof(val));
   60|   733k|}
blend_a64_mask_avx2.c:xx_loadu_128:
   45|  11.2M|static inline __m128i xx_loadu_128(const void *a) {
   46|  11.2M|  return _mm_loadu_si128((const __m128i *)a);
   47|  11.2M|}
blend_a64_mask_avx2.c:xx_roundn_epu16:
   88|   350k|static inline __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
   89|   350k|  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
   90|   350k|  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
   91|   350k|}
blend_a64_mask_avx2.c:xx_storel_64:
   62|  2.17M|static inline void xx_storel_64(void *const a, const __m128i v) {
   63|  2.17M|  _mm_storel_epi64((__m128i *)a, v);
   64|  2.17M|}
blend_a64_mask_avx2.c:xx_storeu_128:
   70|  1.49M|static inline void xx_storeu_128(void *const a, const __m128i v) {
   71|  1.49M|  _mm_storeu_si128((__m128i *)a, v);
   72|  1.49M|}
highbd_convolve_avx2.c:xx_storel_32:
   57|   987k|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|   987k|  const int val = _mm_cvtsi128_si32(v);
   59|   987k|  memcpy(a, &val, sizeof(val));
   60|   987k|}
av1_convolve_horiz_rs_sse4.c:xx_loadu_128:
   45|   556M|static inline __m128i xx_loadu_128(const void *a) {
   46|   556M|  return _mm_loadu_si128((const __m128i *)a);
   47|   556M|}
av1_convolve_horiz_rs_sse4.c:xx_loadl_64:
   37|   192M|static inline __m128i xx_loadl_64(const void *a) {
   38|   192M|  return _mm_loadl_epi64((const __m128i *)a);
   39|   192M|}
av1_convolve_horiz_rs_sse4.c:xx_storel_32:
   57|  48.1M|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  48.1M|  const int val = _mm_cvtsi128_si32(v);
   59|  48.1M|  memcpy(a, &val, sizeof(val));
   60|  48.1M|}
av1_convolve_horiz_rs_sse4.c:xx_storel_64:
   62|   136M|static inline void xx_storel_64(void *const a, const __m128i v) {
   63|   136M|  _mm_storel_epi64((__m128i *)a, v);
   64|   136M|}
filterintra_sse4.c:xx_load_128:
   41|  2.68M|static inline __m128i xx_load_128(const void *a) {
   42|  2.68M|  return _mm_load_si128((const __m128i *)a);
   43|  2.68M|}
filterintra_sse4.c:xx_loadl_64:
   37|   616k|static inline __m128i xx_loadl_64(const void *a) {
   38|   616k|  return _mm_loadl_epi64((const __m128i *)a);
   39|   616k|}
filterintra_sse4.c:xx_loadl_32:
   31|  13.8M|static inline __m128i xx_loadl_32(const void *a) {
   32|  13.8M|  int val;
   33|  13.8M|  memcpy(&val, a, sizeof(val));
   34|  13.8M|  return _mm_cvtsi32_si128(val);
   35|  13.8M|}
filterintra_sse4.c:xx_roundn_epi16_unsigned:
   99|  25.7M|static inline __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) {
  100|  25.7M|  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
  101|  25.7M|  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
  102|  25.7M|  return _mm_srai_epi16(v_tmp_d, bits);
  103|  25.7M|}
filterintra_sse4.c:xx_storel_32:
   57|  25.7M|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  25.7M|  const int val = _mm_cvtsi128_si32(v);
   59|  25.7M|  memcpy(a, &val, sizeof(val));
   60|  25.7M|}
convolve_2d_avx2.c:xx_storel_32:
   57|  4.04M|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  4.04M|  const int val = _mm_cvtsi128_si32(v);
   59|  4.04M|  memcpy(a, &val, sizeof(val));
   60|  4.04M|}
convolve_avx2.c:xx_storel_32:
   57|  1.80M|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  1.80M|  const int val = _mm_cvtsi128_si32(v);
   59|  1.80M|  memcpy(a, &val, sizeof(val));
   60|  1.80M|}
selfguided_avx2.c:xx_loadl_64:
   37|  77.6M|static inline __m128i xx_loadl_64(const void *a) {
   38|  77.6M|  return _mm_loadl_epi64((const __m128i *)a);
   39|  77.6M|}
selfguided_avx2.c:xx_loadu_128:
   45|   458M|static inline __m128i xx_loadu_128(const void *a) {
   46|   458M|  return _mm_loadu_si128((const __m128i *)a);
   47|   458M|}
selfguided_avx2.c:xx_storeu_128:
   70|  39.4M|static inline void xx_storeu_128(void *const a, const __m128i v) {
   71|  39.4M|  _mm_storeu_si128((__m128i *)a, v);
   72|  39.4M|}
wiener_convolve_avx2.c:xx_loadu_128:
   45|   298k|static inline __m128i xx_loadu_128(const void *a) {
   46|   298k|  return _mm_loadu_si128((const __m128i *)a);
   47|   298k|}
highbd_convolve_2d_avx2.c:xx_storel_32:
   57|  1.74M|static inline void xx_storel_32(void *const a, const __m128i v) {
   58|  1.74M|  const int val = _mm_cvtsi128_si32(v);
   59|  1.74M|  memcpy(a, &val, sizeof(val));
   60|  1.74M|}
highbd_wiener_convolve_avx2.c:xx_loadu_128:
   45|   481k|static inline __m128i xx_loadu_128(const void *a) {
   46|   481k|  return _mm_loadu_si128((const __m128i *)a);
   47|   481k|}

blend_a64_mask_avx2.c:yy_loadu_256:
   34|  22.5M|static inline __m256i yy_loadu_256(const void *a) {
   35|  22.5M|  return _mm256_loadu_si256((const __m256i *)a);
   36|  22.5M|}
blend_a64_mask_avx2.c:yy_roundn_epu16:
   90|   775k|static inline __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
   91|   775k|  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
   92|   775k|  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
   93|   775k|}
blend_a64_mask_avx2.c:yy_storeu_256:
   42|  5.84M|static inline void yy_storeu_256(void *const a, const __m256i v) {
   43|  5.84M|  _mm256_storeu_si256((__m256i *)a, v);
   44|  5.84M|}
blend_a64_mask_avx2.c:yy_loadu_4x64:
   67|   228k|                                    const void *e1, const void *e0) {
   68|   228k|  __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0));
   69|   228k|  __m128d v01 = _mm_loadh_pd(v0, (const double *)e1);
   70|   228k|  __m128d v2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e2));
   71|   228k|  __m128d v23 = _mm_loadh_pd(v2, (const double *)e3);
   72|       |  // Note this can be replaced with
   73|       |  // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains
   74|       |  // _mm256_set_m128d() with all supported compilers. This version is used to
   75|       |  // match the behavior with yy_set_m128i().
   76|   228k|  return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
   77|   228k|}
blend_a64_mask_avx2.c:yy_set_m128i:
   59|  1.91M|static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
   60|  1.91M|  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
   61|  1.91M|}
blend_a64_mask_avx2.c:yy_loadu2_128:
   79|  1.68M|static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
   80|  1.68M|  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
   81|  1.68M|  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
   82|  1.68M|  return yy_set_m128i(mhi, mlo);
   83|  1.68M|}
blend_a64_mask_avx2.c:yy_storeu2_128:
   85|   538k|static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
   86|   538k|  _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
   87|   538k|  _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
   88|   538k|}
reconinter_avx2.c:yy_storeu_256:
   42|  2.77M|static inline void yy_storeu_256(void *const a, const __m256i v) {
   43|  2.77M|  _mm256_storeu_si256((__m256i *)a, v);
   44|  2.77M|}
reconinter_avx2.c:yy_loadu_256:
   34|  10.7M|static inline __m256i yy_loadu_256(const void *a) {
   35|  10.7M|  return _mm256_loadu_si256((const __m256i *)a);
   36|  10.7M|}
reconinter_avx2.c:yy_set_m128i:
   59|   289k|static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
   60|   289k|  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
   61|   289k|}
reconinter_avx2.c:yy_loadu2_128:
   79|   289k|static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
   80|   289k|  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
   81|   289k|  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
   82|   289k|  return yy_set_m128i(mhi, mlo);
   83|   289k|}
selfguided_avx2.c:yy_load_256:
   30|   422M|static inline __m256i yy_load_256(const void *a) {
   31|   422M|  return _mm256_load_si256((const __m256i *)a);
   32|   422M|}
selfguided_avx2.c:yy_store_256:
   38|   404M|static inline void yy_store_256(void *const a, const __m256i v) {
   39|   404M|  _mm256_store_si256((__m256i *)a, v);
   40|   404M|}
selfguided_avx2.c:yy_loadu_256:
   34|  2.69G|static inline __m256i yy_loadu_256(const void *a) {
   35|  2.69G|  return _mm256_loadu_si256((const __m256i *)a);
   36|  2.69G|}
selfguided_avx2.c:yy_storeu_256:
   42|   411M|static inline void yy_storeu_256(void *const a, const __m256i v) {
   43|   411M|  _mm256_storeu_si256((__m256i *)a, v);
   44|   411M|}
highbd_wiener_convolve_avx2.c:yy_set_m128i:
   59|  1.92M|static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) {
   60|  1.92M|  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
   61|  1.92M|}
highbd_wiener_convolve_avx2.c:yy_loadu_256:
   34|   274M|static inline __m256i yy_loadu_256(const void *a) {
   35|   274M|  return _mm256_loadu_si256((const __m256i *)a);
   36|   274M|}
highbd_wiener_convolve_avx2.c:yy_storeu_256:
   42|  36.3M|static inline void yy_storeu_256(void *const a, const __m256i v) {
   43|  36.3M|  _mm256_storeu_si256((__m256i *)a, v);
   44|  36.3M|}

av1_inv_txfm_ssse3.c:transpose_16bit_8x8:
  215|  3.13M|                                       __m128i *const out) {
  216|       |  // Unpack 16 bit elements. Goes from:
  217|       |  // in[0]: 00 01 02 03  04 05 06 07
  218|       |  // in[1]: 10 11 12 13  14 15 16 17
  219|       |  // in[2]: 20 21 22 23  24 25 26 27
  220|       |  // in[3]: 30 31 32 33  34 35 36 37
  221|       |  // in[4]: 40 41 42 43  44 45 46 47
  222|       |  // in[5]: 50 51 52 53  54 55 56 57
  223|       |  // in[6]: 60 61 62 63  64 65 66 67
  224|       |  // in[7]: 70 71 72 73  74 75 76 77
  225|       |  // to:
  226|       |  // a0:    00 10 01 11  02 12 03 13
  227|       |  // a1:    20 30 21 31  22 32 23 33
  228|       |  // a2:    40 50 41 51  42 52 43 53
  229|       |  // a3:    60 70 61 71  62 72 63 73
  230|       |  // a4:    04 14 05 15  06 16 07 17
  231|       |  // a5:    24 34 25 35  26 36 27 37
  232|       |  // a6:    44 54 45 55  46 56 47 57
  233|       |  // a7:    64 74 65 75  66 76 67 77
  234|  3.13M|  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  235|  3.13M|  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  236|  3.13M|  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  237|  3.13M|  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
  238|  3.13M|  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  239|  3.13M|  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  240|  3.13M|  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
  241|  3.13M|  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
  242|       |
  243|       |  // Unpack 32 bit elements resulting in:
  244|       |  // b0: 00 10 20 30  01 11 21 31
  245|       |  // b1: 40 50 60 70  41 51 61 71
  246|       |  // b2: 04 14 24 34  05 15 25 35
  247|       |  // b3: 44 54 64 74  45 55 65 75
  248|       |  // b4: 02 12 22 32  03 13 23 33
  249|       |  // b5: 42 52 62 72  43 53 63 73
  250|       |  // b6: 06 16 26 36  07 17 27 37
  251|       |  // b7: 46 56 66 76  47 57 67 77
  252|  3.13M|  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  253|  3.13M|  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  254|  3.13M|  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  255|  3.13M|  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
  256|  3.13M|  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  257|  3.13M|  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
  258|  3.13M|  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  259|  3.13M|  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
  260|       |
  261|       |  // Unpack 64 bit elements resulting in:
  262|       |  // out[0]: 00 10 20 30  40 50 60 70
  263|       |  // out[1]: 01 11 21 31  41 51 61 71
  264|       |  // out[2]: 02 12 22 32  42 52 62 72
  265|       |  // out[3]: 03 13 23 33  43 53 63 73
  266|       |  // out[4]: 04 14 24 34  44 54 64 74
  267|       |  // out[5]: 05 15 25 35  45 55 65 75
  268|       |  // out[6]: 06 16 26 36  46 56 66 76
  269|       |  // out[7]: 07 17 27 37  47 57 67 77
  270|  3.13M|  out[0] = _mm_unpacklo_epi64(b0, b1);
  271|  3.13M|  out[1] = _mm_unpackhi_epi64(b0, b1);
  272|  3.13M|  out[2] = _mm_unpacklo_epi64(b4, b5);
  273|  3.13M|  out[3] = _mm_unpackhi_epi64(b4, b5);
  274|  3.13M|  out[4] = _mm_unpacklo_epi64(b2, b3);
  275|  3.13M|  out[5] = _mm_unpackhi_epi64(b2, b3);
  276|  3.13M|  out[6] = _mm_unpacklo_epi64(b6, b7);
  277|  3.13M|  out[7] = _mm_unpackhi_epi64(b6, b7);
  278|  3.13M|}
av1_inv_txfm_ssse3.c:transpose_16bit_4x4:
   97|   588k|                                       __m128i *const out) {
   98|       |  // Unpack 16 bit elements. Goes from:
   99|       |  // in[0]: 00 01 02 03  XX XX XX XX
  100|       |  // in[1]: 10 11 12 13  XX XX XX XX
  101|       |  // in[2]: 20 21 22 23  XX XX XX XX
  102|       |  // in[3]: 30 31 32 33  XX XX XX XX
  103|       |  // to:
  104|       |  // a0:    00 10 01 11  02 12 03 13
  105|       |  // a1:    20 30 21 31  22 32 23 33
  106|   588k|  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  107|   588k|  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  108|       |
  109|       |  // Unpack 32 bit elements resulting in:
  110|       |  // out[0]: 00 10 20 30  01 11 21 31
  111|       |  // out[1]: 01 11 21 31  __ __ __ __
  112|       |  // out[2]: 02 12 22 32  03 13 23 33
  113|       |  // out[3]: 03 13 23 33  __ __ __ __
  114|       |  //
  115|       |  // Note: The high 64 bits of the output registers are shown for informational
  116|       |  // purposes only. Callers should only use the low 64 bits of the output
  117|       |  // registers. "__" indicates zeros.
  118|   588k|  out[0] = _mm_unpacklo_epi32(a0, a1);
  119|   588k|  out[1] = _mm_srli_si128(out[0], 8);
  120|   588k|  out[2] = _mm_unpackhi_epi32(a0, a1);
  121|   588k|  out[3] = _mm_srli_si128(out[2], 8);
  122|   588k|}
av1_inv_txfm_ssse3.c:transpose_16bit_8x4:
  167|   912k|                                       __m128i *const out) {
  168|       |  // Unpack 16 bit elements. Goes from:
  169|       |  // in[0]: 00 01 02 03  04 05 06 07
  170|       |  // in[1]: 10 11 12 13  14 15 16 17
  171|       |  // in[2]: 20 21 22 23  24 25 26 27
  172|       |  // in[3]: 30 31 32 33  34 35 36 37
  173|       |
  174|       |  // to:
  175|       |  // a0:    00 10 01 11  02 12 03 13
  176|       |  // a1:    20 30 21 31  22 32 23 33
  177|       |  // a4:    04 14 05 15  06 16 07 17
  178|       |  // a5:    24 34 25 35  26 36 27 37
  179|   912k|  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  180|   912k|  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  181|   912k|  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  182|   912k|  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  183|       |
  184|       |  // Unpack 32 bit elements resulting in:
  185|       |  // b0: 00 10 20 30  01 11 21 31
  186|       |  // b2: 04 14 24 34  05 15 25 35
  187|       |  // b4: 02 12 22 32  03 13 23 33
  188|       |  // b6: 06 16 26 36  07 17 27 37
  189|   912k|  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  190|   912k|  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  191|   912k|  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  192|   912k|  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  193|       |
  194|       |  // Unpack 64 bit elements resulting in:
  195|       |  // out[0]: 00 10 20 30  XX XX XX XX
  196|       |  // out[1]: 01 11 21 31  XX XX XX XX
  197|       |  // out[2]: 02 12 22 32  XX XX XX XX
  198|       |  // out[3]: 03 13 23 33  XX XX XX XX
  199|       |  // out[4]: 04 14 24 34  XX XX XX XX
  200|       |  // out[5]: 05 15 25 35  XX XX XX XX
  201|       |  // out[6]: 06 16 26 36  XX XX XX XX
  202|       |  // out[7]: 07 17 27 37  XX XX XX XX
  203|   912k|  const __m128i zeros = _mm_setzero_si128();
  204|   912k|  out[0] = _mm_unpacklo_epi64(b0, zeros);
  205|   912k|  out[1] = _mm_unpackhi_epi64(b0, zeros);
  206|   912k|  out[2] = _mm_unpacklo_epi64(b4, zeros);
  207|   912k|  out[3] = _mm_unpackhi_epi64(b4, zeros);
  208|   912k|  out[4] = _mm_unpacklo_epi64(b2, zeros);
  209|   912k|  out[5] = _mm_unpackhi_epi64(b2, zeros);
  210|   912k|  out[6] = _mm_unpacklo_epi64(b6, zeros);
  211|   912k|  out[7] = _mm_unpackhi_epi64(b6, zeros);
  212|   912k|}
av1_inv_txfm_ssse3.c:transpose_16bit_4x8:
  125|  1.57M|                                       __m128i *const out) {
  126|       |  // Unpack 16 bit elements. Goes from:
  127|       |  // in[0]: 00 01 02 03  XX XX XX XX
  128|       |  // in[1]: 10 11 12 13  XX XX XX XX
  129|       |  // in[2]: 20 21 22 23  XX XX XX XX
  130|       |  // in[3]: 30 31 32 33  XX XX XX XX
  131|       |  // in[4]: 40 41 42 43  XX XX XX XX
  132|       |  // in[5]: 50 51 52 53  XX XX XX XX
  133|       |  // in[6]: 60 61 62 63  XX XX XX XX
  134|       |  // in[7]: 70 71 72 73  XX XX XX XX
  135|       |  // to:
  136|       |  // a0:    00 10 01 11  02 12 03 13
  137|       |  // a1:    20 30 21 31  22 32 23 33
  138|       |  // a2:    40 50 41 51  42 52 43 53
  139|       |  // a3:    60 70 61 71  62 72 63 73
  140|  1.57M|  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  141|  1.57M|  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  142|  1.57M|  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  143|  1.57M|  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
  144|       |
  145|       |  // Unpack 32 bit elements resulting in:
  146|       |  // b0: 00 10 20 30  01 11 21 31
  147|       |  // b1: 40 50 60 70  41 51 61 71
  148|       |  // b2: 02 12 22 32  03 13 23 33
  149|       |  // b3: 42 52 62 72  43 53 63 73
  150|  1.57M|  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  151|  1.57M|  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  152|  1.57M|  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
  153|  1.57M|  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
  154|       |
  155|       |  // Unpack 64 bit elements resulting in:
  156|       |  // out[0]: 00 10 20 30  40 50 60 70
  157|       |  // out[1]: 01 11 21 31  41 51 61 71
  158|       |  // out[2]: 02 12 22 32  42 52 62 72
  159|       |  // out[3]: 03 13 23 33  43 53 63 73
  160|  1.57M|  out[0] = _mm_unpacklo_epi64(b0, b1);
  161|  1.57M|  out[1] = _mm_unpackhi_epi64(b0, b1);
  162|  1.57M|  out[2] = _mm_unpacklo_epi64(b2, b3);
  163|  1.57M|  out[3] = _mm_unpackhi_epi64(b2, b3);
  164|  1.57M|}
highbd_inv_txfm_sse4.c:transpose_32bit_4x4:
  300|  6.31M|                                       __m128i *const out) {
  301|       |  // Unpack 32 bit elements. Goes from:
  302|       |  // in[0]: 00 01 02 03
  303|       |  // in[1]: 10 11 12 13
  304|       |  // in[2]: 20 21 22 23
  305|       |  // in[3]: 30 31 32 33
  306|       |  // to:
  307|       |  // a0:    00 10 01 11
  308|       |  // a1:    20 30 21 31
  309|       |  // a2:    02 12 03 13
  310|       |  // a3:    22 32 23 33
  311|       |
  312|  6.31M|  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  313|  6.31M|  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  314|  6.31M|  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  315|  6.31M|  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
  316|       |
  317|       |  // Unpack 64 bit elements resulting in:
  318|       |  // out[0]: 00 10 20 30
  319|       |  // out[1]: 01 11 21 31
  320|       |  // out[2]: 02 12 22 32
  321|       |  // out[3]: 03 13 23 33
  322|  6.31M|  out[0] = _mm_unpacklo_epi64(a0, a1);
  323|  6.31M|  out[1] = _mm_unpackhi_epi64(a0, a1);
  324|  6.31M|  out[2] = _mm_unpacklo_epi64(a2, a3);
  325|  6.31M|  out[3] = _mm_unpackhi_epi64(a2, a3);
  326|  6.31M|}

av1_inv_txfm_avx2.c:pair_set_w16_epi16:
   23|  58.7M|static inline __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
   24|  58.7M|  return _mm256_set1_epi32(
   25|  58.7M|      (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)));
   26|  58.7M|}
av1_inv_txfm_avx2.c:btf_16_adds_subs_avx2:
   52|   104M|static inline void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
   53|   104M|  const __m256i _in0 = *in0;
   54|   104M|  const __m256i _in1 = *in1;
   55|   104M|  *in0 = _mm256_adds_epi16(_in0, _in1);
   56|   104M|  *in1 = _mm256_subs_epi16(_in0, _in1);
   57|   104M|}
av1_inv_txfm_avx2.c:btf_16_w16_avx2:
   30|  54.2M|                                   const int32_t cos_bit) {
   31|  54.2M|  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
   32|  54.2M|  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
   33|  54.2M|  __m256i u0 = _mm256_madd_epi16(t0, w0);
   34|  54.2M|  __m256i u1 = _mm256_madd_epi16(t1, w0);
   35|  54.2M|  __m256i v0 = _mm256_madd_epi16(t0, w1);
   36|  54.2M|  __m256i v1 = _mm256_madd_epi16(t1, w1);
   37|       |
   38|  54.2M|  __m256i a0 = _mm256_add_epi32(u0, _r);
   39|  54.2M|  __m256i a1 = _mm256_add_epi32(u1, _r);
   40|  54.2M|  __m256i b0 = _mm256_add_epi32(v0, _r);
   41|  54.2M|  __m256i b1 = _mm256_add_epi32(v1, _r);
   42|       |
   43|  54.2M|  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
   44|  54.2M|  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
   45|  54.2M|  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
   46|  54.2M|  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
   47|       |
   48|  54.2M|  *in0 = _mm256_packs_epi32(c0, c1);
   49|  54.2M|  *in1 = _mm256_packs_epi32(d0, d1);
   50|  54.2M|}
av1_inv_txfm_avx2.c:btf_16_adds_subs_out_avx2:
   67|  39.6M|                                             __m256i in0, __m256i in1) {
   68|  39.6M|  const __m256i _in0 = in0;
   69|  39.6M|  const __m256i _in1 = in1;
   70|  39.6M|  *out0 = _mm256_adds_epi16(_in0, _in1);
   71|  39.6M|  *out1 = _mm256_subs_epi16(_in0, _in1);
   72|  39.6M|}
av1_inv_txfm_avx2.c:load_buffer_32bit_to_16bit_w16_avx2:
  111|  1.58M|                                                       int out_size) {
  112|  28.5M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (112:19): [True: 26.9M, False: 1.58M]
  ------------------
  113|  26.9M|    out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
  114|  26.9M|  }
  115|  1.58M|}
av1_inv_txfm_avx2.c:load_32bit_to_16bit_w16_avx2:
  103|  27.8M|static inline __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
  104|  27.8M|  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
  105|  27.8M|  const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
  106|  27.8M|  return _mm256_permute4x64_epi64(b, 0xD8);
  107|  27.8M|}
av1_inv_txfm_avx2.c:flip_buf_avx2:
  228|  16.9k|static inline void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
  229|   288k|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (229:19): [True: 271k, False: 16.9k]
  ------------------
  230|   271k|    out[size - i - 1] = in[i];
  231|   271k|  }
  232|  16.9k|}
av1_inv_txfm_avx2.c:transpose_16bit_16x16_avx2:
  158|  2.95M|                                              __m256i *const out) {
  159|  2.95M|  __m256i t[16];
  160|       |
  161|  2.95M|#define LOADL(idx)                                                            \
  162|  2.95M|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  163|  2.95M|  t[idx] = _mm256_inserti128_si256(                                           \
  164|  2.95M|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  165|       |
  166|  2.95M|#define LOADR(idx)                                                           \
  167|  2.95M|  t[8 + idx] =                                                               \
  168|  2.95M|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  169|  2.95M|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  170|  2.95M|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  171|       |
  172|       |  // load left 8x16
  173|  2.95M|  LOADL(0)
  ------------------
  |  |  162|  2.95M|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|  2.95M|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|  2.95M|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  174|  2.95M|  LOADL(1)
  ------------------
  |  |  162|  2.95M|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|  2.95M|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|  2.95M|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  175|  2.95M|  LOADL(2)
  ------------------
  |  |  162|  2.95M|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|  2.95M|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|  2.95M|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  176|  2.95M|  LOADL(3)
  ------------------
  |  |  162|  2.95M|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|  2.95M|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|  2.95M|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  177|  2.95M|  LOADL(4)
  ------------------
  |  |  162|  2.95M|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|  2.95M|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|  2.95M|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  178|  2.95M|  LOADL(5)
  ------------------
  |  |  162|  2.95M|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|  2.95M|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|  2.95M|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  179|  2.95M|  LOADL(6)
  ------------------
  |  |  162|  2.95M|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|  2.95M|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|  2.95M|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  180|  2.95M|  LOADL(7)
  ------------------
  |  |  162|  2.95M|  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
  |  |  163|  2.95M|  t[idx] = _mm256_inserti128_si256(                                           \
  |  |  164|  2.95M|      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
  ------------------
  181|       |
  182|       |  // load right 8x16
  183|  2.95M|  LOADR(0)
  ------------------
  |  |  167|  2.95M|  t[8 + idx] =                                                               \
  |  |  168|  2.95M|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|  2.95M|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|  2.95M|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  184|  2.95M|  LOADR(1)
  ------------------
  |  |  167|  2.95M|  t[8 + idx] =                                                               \
  |  |  168|  2.95M|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|  2.95M|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|  2.95M|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  185|  2.95M|  LOADR(2)
  ------------------
  |  |  167|  2.95M|  t[8 + idx] =                                                               \
  |  |  168|  2.95M|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|  2.95M|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|  2.95M|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  186|  2.95M|  LOADR(3)
  ------------------
  |  |  167|  2.95M|  t[8 + idx] =                                                               \
  |  |  168|  2.95M|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|  2.95M|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|  2.95M|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  187|  2.95M|  LOADR(4)
  ------------------
  |  |  167|  2.95M|  t[8 + idx] =                                                               \
  |  |  168|  2.95M|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|  2.95M|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|  2.95M|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  188|  2.95M|  LOADR(5)
  ------------------
  |  |  167|  2.95M|  t[8 + idx] =                                                               \
  |  |  168|  2.95M|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|  2.95M|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|  2.95M|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  189|  2.95M|  LOADR(6)
  ------------------
  |  |  167|  2.95M|  t[8 + idx] =                                                               \
  |  |  168|  2.95M|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|  2.95M|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|  2.95M|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  190|  2.95M|  LOADR(7)
  ------------------
  |  |  167|  2.95M|  t[8 + idx] =                                                               \
  |  |  168|  2.95M|      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
  |  |  169|  2.95M|  t[8 + idx] = _mm256_inserti128_si256(                                      \
  |  |  170|  2.95M|      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
  ------------------
  191|       |
  192|       |  // get the top 16x8 result
  193|  2.95M|  transpose2_8x8_avx2(t, out);
  194|       |  // get the bottom 16x8 result
  195|  2.95M|  transpose2_8x8_avx2(&t[8], &out[8]);
  196|  2.95M|}
av1_inv_txfm_avx2.c:transpose2_8x8_avx2:
  118|  5.91M|                                       __m256i *const out) {
  119|  5.91M|  __m256i t[16], u[16];
  120|       |  // (1st, 2nd) ==> (lo, hi)
  121|       |  //   (0, 1)   ==>  (0, 1)
  122|       |  //   (2, 3)   ==>  (2, 3)
  123|       |  //   (4, 5)   ==>  (4, 5)
  124|       |  //   (6, 7)   ==>  (6, 7)
  125|  29.5M|  for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (125:19): [True: 23.6M, False: 5.91M]
  ------------------
  126|  23.6M|    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
  127|  23.6M|    t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
  128|  23.6M|  }
  129|       |
  130|       |  // (1st, 2nd) ==> (lo, hi)
  131|       |  //   (0, 2)   ==>  (0, 2)
  132|       |  //   (1, 3)   ==>  (1, 3)
  133|       |  //   (4, 6)   ==>  (4, 6)
  134|       |  //   (5, 7)   ==>  (5, 7)
  135|  17.7M|  for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (135:19): [True: 11.8M, False: 5.91M]
  ------------------
  136|  11.8M|    u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
  137|  11.8M|    u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
  138|       |
  139|  11.8M|    u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
  140|  11.8M|    u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
  141|  11.8M|  }
  142|       |
  143|       |  // (1st, 2nd) ==> (lo, hi)
  144|       |  //   (0, 4)   ==>  (0, 1)
  145|       |  //   (1, 5)   ==>  (4, 5)
  146|       |  //   (2, 6)   ==>  (2, 3)
  147|       |  //   (3, 7)   ==>  (6, 7)
  148|  17.7M|  for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (148:19): [True: 11.8M, False: 5.91M]
  ------------------
  149|  11.8M|    out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
  150|  11.8M|    out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
  151|       |
  152|  11.8M|    out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
  153|  11.8M|    out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
  154|  11.8M|  }
  155|  5.91M|}
av1_inv_txfm_avx2.c:round_shift_16bit_w16_avx2:
  234|  18.3k|static inline void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
  235|  18.3k|  if (bit < 0) {
  ------------------
  |  Branch (235:7): [True: 18.3k, False: 0]
  ------------------
  236|  18.3k|    bit = -bit;
  237|  18.3k|    __m256i round = _mm256_set1_epi16(1 << (bit - 1));
  238|   312k|    for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (238:21): [True: 294k, False: 18.3k]
  ------------------
  239|   294k|      in[i] = _mm256_adds_epi16(in[i], round);
  240|   294k|      in[i] = _mm256_srai_epi16(in[i], bit);
  241|   294k|    }
  242|  18.3k|  } else if (bit > 0) {
  ------------------
  |  Branch (242:14): [True: 0, False: 0]
  ------------------
  243|      0|    for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (243:21): [True: 0, False: 0]
  ------------------
  244|      0|      in[i] = _mm256_slli_epi16(in[i], bit);
  245|      0|    }
  246|      0|  }
  247|  18.3k|}
highbd_inv_txfm_avx2.c:round_shift_rect_array_32_avx2:
  274|  1.22M|                                                  const int val) {
  275|  1.22M|  const __m256i sqrt2 = _mm256_set1_epi32(val);
  276|  1.22M|  if (bit > 0) {
  ------------------
  |  Branch (276:7): [True: 0, False: 1.22M]
  ------------------
  277|      0|    int i;
  278|      0|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (278:17): [True: 0, False: 0]
  ------------------
  279|      0|      const __m256i r0 = round_shift_32_avx2(input[i], bit);
  280|      0|      const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
  281|      0|      output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
  ------------------
  |  |   41|      0|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  282|      0|    }
  283|  1.22M|  } else {
  284|  1.22M|    int i;
  285|  13.7M|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (285:17): [True: 12.5M, False: 1.22M]
  ------------------
  286|  12.5M|      const __m256i r0 = _mm256_slli_epi32(input[i], -bit);
  287|  12.5M|      const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
  288|  12.5M|      output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
  ------------------
  |  |   41|  12.5M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  289|  12.5M|    }
  290|  1.22M|  }
  291|  1.22M|}
highbd_inv_txfm_avx2.c:round_shift_32_avx2:
  249|   235M|static inline __m256i round_shift_32_avx2(__m256i vec, int bit) {
  250|   235M|  __m256i tmp, round;
  251|   235M|  round = _mm256_set1_epi32(1 << (bit - 1));
  252|   235M|  tmp = _mm256_add_epi32(vec, round);
  253|   235M|  return _mm256_srai_epi32(tmp, bit);
  254|   235M|}
highbd_inv_txfm_avx2.c:round_shift_array_32_avx2:
  257|  11.4M|                                             const int size, const int bit) {
  258|  11.4M|  if (bit > 0) {
  ------------------
  |  Branch (258:7): [True: 11.4M, False: 18.4E]
  ------------------
  259|  11.4M|    int i;
  260|   234M|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (260:17): [True: 223M, False: 11.4M]
  ------------------
  261|   223M|      output[i] = round_shift_32_avx2(input[i], bit);
  262|   223M|    }
  263|  18.4E|  } else {
  264|  18.4E|    int i;
  265|  18.4E|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (265:17): [True: 0, False: 18.4E]
  ------------------
  266|      0|      output[i] = _mm256_slli_epi32(input[i], -bit);
  267|      0|    }
  268|  18.4E|  }
  269|  11.4M|}

aom_memalign:
   55|  3.67M|void *aom_memalign(size_t align, size_t size) {
   56|  3.67M|  void *x = NULL;
   57|  3.67M|  if (!check_size_argument_overflow(1, size, align)) return NULL;
  ------------------
  |  Branch (57:7): [True: 5, False: 3.67M]
  ------------------
   58|  3.67M|  const size_t aligned_size = size + GetAllocationPaddingSize(align);
   59|  3.67M|  void *const addr = malloc(aligned_size);
   60|  3.67M|  if (addr) {
  ------------------
  |  Branch (60:7): [True: 3.67M, False: 18.4E]
  ------------------
   61|  3.67M|    x = aom_align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
  ------------------
  |  |   49|  3.67M|  (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
  ------------------
   62|  3.67M|    SetActualMallocAddress(x, addr);
   63|  3.67M|  }
   64|  3.67M|  return x;
   65|  3.67M|}
aom_malloc:
   67|  1.60M|void *aom_malloc(size_t size) { return aom_memalign(DEFAULT_ALIGNMENT, size); }
  ------------------
  |  |   25|  1.60M|#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */
  ------------------
aom_calloc:
   69|   870k|void *aom_calloc(size_t num, size_t size) {
   70|   870k|  if (!check_size_argument_overflow(num, size, DEFAULT_ALIGNMENT)) return NULL;
  ------------------
  |  |   25|   870k|#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */
  ------------------
  |  Branch (70:7): [True: 262, False: 869k]
  ------------------
   71|   869k|  const size_t total_size = num * size;
   72|   869k|  void *const x = aom_malloc(total_size);
   73|   869k|  if (x) memset(x, 0, total_size);
  ------------------
  |  Branch (73:7): [True: 869k, False: 0]
  ------------------
   74|   869k|  return x;
   75|   870k|}
aom_free:
   77|  6.00M|void aom_free(void *memblk) {
   78|  6.00M|  if (memblk) {
  ------------------
  |  Branch (78:7): [True: 3.67M, False: 2.33M]
  ------------------
   79|  3.67M|    void *addr = GetActualMallocAddress(memblk);
   80|  3.67M|    free(addr);
   81|  3.67M|  }
   82|  6.00M|}
aom_mem.c:check_size_argument_overflow:
   27|  4.54M|                                        size_t align) {
   28|  4.54M|  if (nmemb == 0) return 1;
  ------------------
  |  Branch (28:7): [True: 10.2k, False: 4.53M]
  ------------------
   29|  4.53M|  const size_t alloc_padding = GetAllocationPaddingSize(align);
   30|  4.53M|#if defined(AOM_MAX_ALLOCABLE_MEMORY)
   31|  4.53M|  assert(AOM_MAX_ALLOCABLE_MEMORY >= alloc_padding);
   32|  4.53M|  assert(AOM_MAX_ALLOCABLE_MEMORY <= SIZE_MAX);
   33|  4.53M|  if (size > (AOM_MAX_ALLOCABLE_MEMORY - alloc_padding) / nmemb) return 0;
  ------------------
  |  Branch (33:7): [True: 267, False: 4.53M]
  ------------------
   34|       |#else
   35|       |  if (size > (SIZE_MAX - alloc_padding) / nmemb) return 0;
   36|       |#endif
   37|  4.53M|  return 1;
   38|  4.53M|}
aom_mem.c:GetAllocationPaddingSize:
   19|  8.20M|static size_t GetAllocationPaddingSize(size_t align) {
   20|  8.20M|  assert(align > 0);
   21|  8.20M|  assert(align < SIZE_MAX - ADDRESS_STORAGE_SIZE);
   22|  8.20M|  return align - 1 + ADDRESS_STORAGE_SIZE;
  ------------------
  |  |   17|  8.20M|#define ADDRESS_STORAGE_SIZE sizeof(size_t)
  ------------------
   23|  8.20M|}
aom_mem.c:SetActualMallocAddress:
   45|  3.67M|                                   const void *const malloc_addr) {
   46|  3.67M|  size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
   47|  3.67M|  *malloc_addr_location = (size_t)malloc_addr;
   48|  3.67M|}
aom_mem.c:GetMallocAddressLocation:
   40|  7.34M|static size_t *GetMallocAddressLocation(void *const mem) {
   41|  7.34M|  return ((size_t *)mem) - 1;
   42|  7.34M|}
aom_mem.c:GetActualMallocAddress:
   50|  3.67M|static void *GetActualMallocAddress(void *const mem) {
   51|  3.67M|  const size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
   52|  3.67M|  return (void *)(*malloc_addr_location);
   53|  3.67M|}

decodeframe.c:aom_memset16:
   40|  11.8M|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|  11.8M|  size_t i;
   42|  11.8M|  uint16_t *dest16 = (uint16_t *)dest;
   43|   204M|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 192M, False: 11.8M]
  ------------------
   44|  11.8M|  return dest;
   45|  11.8M|}
intrapred.c:aom_memset16:
   40|  43.6M|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|  43.6M|  size_t i;
   42|  43.6M|  uint16_t *dest16 = (uint16_t *)dest;
   43|  1.74G|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 1.70G, False: 43.6M]
  ------------------
   44|  43.6M|  return dest;
   45|  43.6M|}
yv12extend.c:aom_memset16:
   40|  5.94M|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|  5.94M|  size_t i;
   42|  5.94M|  uint16_t *dest16 = (uint16_t *)dest;
   43|  1.66G|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 1.66G, False: 5.94M]
  ------------------
   44|  5.94M|  return dest;
   45|  5.94M|}
reconintra.c:aom_memset16:
   40|   199M|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|   199M|  size_t i;
   42|   199M|  uint16_t *dest16 = (uint16_t *)dest;
   43|  31.0G|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 30.8G, False: 199M]
  ------------------
   44|   199M|  return dest;
   45|   199M|}
resize.c:aom_memset16:
   40|  3.01M|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|  3.01M|  size_t i;
   42|  3.01M|  uint16_t *dest16 = (uint16_t *)dest;
   43|  18.0M|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 15.0M, False: 3.01M]
  ------------------
   44|  3.01M|  return dest;
   45|  3.01M|}
restoration.c:aom_memset16:
   40|   848k|static inline void *aom_memset16(void *dest, int val, size_t length) {
   41|   848k|  size_t i;
   42|   848k|  uint16_t *dest16 = (uint16_t *)dest;
   43|  4.24M|  for (i = 0; i < length; i++) *dest16++ = val;
  ------------------
  |  Branch (43:15): [True: 3.39M, False: 848k]
  ------------------
   44|   848k|  return dest;
   45|   848k|}

aom_dsp_rtcd.c:aom_once:
   65|  16.1k|static void aom_once(void (*func)(void)) {
   66|  16.1k|  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  16.1k|  pthread_once(&lock, func);
   68|  16.1k|}
aom_scale_rtcd.c:aom_once:
   65|  16.1k|static void aom_once(void (*func)(void)) {
   66|  16.1k|  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  16.1k|  pthread_once(&lock, func);
   68|  16.1k|}
av1_rtcd.c:aom_once:
   65|  16.1k|static void aom_once(void (*func)(void)) {
   66|  16.1k|  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  16.1k|  pthread_once(&lock, func);
   68|  16.1k|}
reconinter.c:aom_once:
   65|  16.1k|static void aom_once(void (*func)(void)) {
   66|  16.1k|  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  16.1k|  pthread_once(&lock, func);
   68|  16.1k|}
reconintra.c:aom_once:
   65|  16.1k|static void aom_once(void (*func)(void)) {
   66|  16.1k|  static pthread_once_t lock = PTHREAD_ONCE_INIT;
   67|  16.1k|  pthread_once(&lock, func);
   68|  16.1k|}

decodeframe.c:get_msb:
   42|   451k|static inline int get_msb(unsigned int n) {
   43|   451k|  assert(n != 0);
   44|   451k|  return 31 ^ __builtin_clz(n);
   45|   451k|}
decodemv.c:aom_ceil_log2:
   74|   708k|static inline int aom_ceil_log2(int n) {
   75|   708k|  if (n < 2) return 0;
  ------------------
  |  Branch (75:7): [True: 369k, False: 339k]
  ------------------
   76|   339k|  return get_msb(n - 1) + 1;
   77|   708k|}
decodemv.c:get_msb:
   42|   339k|static inline int get_msb(unsigned int n) {
   43|   339k|  assert(n != 0);
   44|   339k|  return 31 ^ __builtin_clz(n);
   45|   339k|}
detokenize.c:get_msb:
   42|   161k|static inline int get_msb(unsigned int n) {
   43|   161k|  assert(n != 0);
   44|   161k|  return 31 ^ __builtin_clz(n);
   45|   161k|}
bitreader_buffer.c:get_msb:
   42|  29.9k|static inline int get_msb(unsigned int n) {
   43|  29.9k|  assert(n != 0);
   44|  29.9k|  return 31 ^ __builtin_clz(n);
   45|  29.9k|}
binary_codes_reader.c:get_msb:
   42|   111k|static inline int get_msb(unsigned int n) {
   43|   111k|  assert(n != 0);
   44|   111k|  return 31 ^ __builtin_clz(n);
   45|   111k|}
entdec.c:get_msb:
   42|   639M|static inline int get_msb(unsigned int n) {
   43|   639M|  assert(n != 0);
   44|   639M|  return 31 ^ __builtin_clz(n);
   45|   639M|}
cdef_block.c:get_msb:
   42|  28.6M|static inline int get_msb(unsigned int n) {
   43|  28.6M|  assert(n != 0);
   44|  28.6M|  return 31 ^ __builtin_clz(n);
   45|  28.6M|}
warped_motion.c:get_msb:
   42|   867k|static inline int get_msb(unsigned int n) {
   43|   867k|  assert(n != 0);
   44|   867k|  return 31 ^ __builtin_clz(n);
   45|   867k|}
cdef_block_avx2.c:get_msb:
   42|   245M|static inline int get_msb(unsigned int n) {
   43|   245M|  assert(n != 0);
   44|   245M|  return 31 ^ __builtin_clz(n);
   45|   245M|}

av1_dec_fuzzer.cc:_ZL19mem_get_le32_as_intPKv:
  125|   292k|static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) {
  126|   292k|  unsigned MEM_VALUE_T val;
  127|   292k|  const MAU_T *mem = (const MAU_T *)vmem;
  128|       |
  129|   292k|  val = ((unsigned MEM_VALUE_T)mem[3]) << 24;
  130|   292k|  val |= mem[2] << 16;
  131|   292k|  val |= mem[1] << 8;
  132|   292k|  val |= mem[0];
  133|   292k|  return val;
  134|   292k|}
decodeframe.c:mem_get_le16_as_int:
  102|  7.03k|static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) {
  103|  7.03k|  unsigned MEM_VALUE_T val;
  104|  7.03k|  const MAU_T *mem = (const MAU_T *)vmem;
  105|       |
  106|  7.03k|  val = mem[1] << 8;
  107|  7.03k|  val |= mem[0];
  108|  7.03k|  return val;
  109|  7.03k|}
decodeframe.c:mem_get_le24_as_int:
  113|  1.37k|static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) {
  114|  1.37k|  unsigned MEM_VALUE_T val;
  115|  1.37k|  const MAU_T *mem = (const MAU_T *)vmem;
  116|       |
  117|  1.37k|  val = mem[2] << 16;
  118|  1.37k|  val |= mem[1] << 8;
  119|  1.37k|  val |= mem[0];
  120|  1.37k|  return val;
  121|  1.37k|}
decodeframe.c:mem_get_le32_as_int:
  125|  1.04k|static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) {
  126|  1.04k|  unsigned MEM_VALUE_T val;
  127|  1.04k|  const MAU_T *mem = (const MAU_T *)vmem;
  128|       |
  129|  1.04k|  val = ((unsigned MEM_VALUE_T)mem[3]) << 24;
  130|  1.04k|  val |= mem[2] << 16;
  131|  1.04k|  val |= mem[1] << 8;
  132|  1.04k|  val |= mem[0];
  133|  1.04k|  return val;
  134|  1.04k|}

aom_dsp_rtcd.c:x86_simd_caps:
  197|      1|static inline int x86_simd_caps(void) {
  198|      1|  unsigned int flags = 0;
  199|      1|  unsigned int mask = ~0u;
  200|      1|  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
  201|      1|  char *env;
  202|       |
  203|       |  /* See if the CPU capabilities are being overridden by the environment */
  204|      1|  env = getenv("AOM_SIMD_CAPS");
  205|      1|  if (env && *env) return (int)strtol(env, NULL, 0);
  ------------------
  |  Branch (205:7): [True: 0, False: 1]
  |  Branch (205:14): [True: 0, False: 0]
  ------------------
  206|       |
  207|      1|  env = getenv("AOM_SIMD_CAPS_MASK");
  208|      1|  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
  ------------------
  |  Branch (208:7): [True: 0, False: 1]
  |  Branch (208:14): [True: 0, False: 0]
  ------------------
  209|       |
  210|       |  /* Ensure that the CPUID instruction supports extended features */
  211|      1|  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  212|       |
  213|      1|  if (max_cpuid_val < 1) return 0;
  ------------------
  |  Branch (213:7): [True: 0, False: 1]
  ------------------
  214|       |
  215|       |  /* Get the standard feature flags */
  216|      1|  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  217|       |
  218|      1|  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  161|      1|#define HAS_MMX 0x01
  ------------------
  219|      1|  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  162|      1|#define HAS_SSE 0x02
  ------------------
  220|      1|  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  163|      1|#define HAS_SSE2 0x04
  ------------------
  221|      1|  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  164|      1|#define HAS_SSE3 0x08
  ------------------
  222|      1|  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  223|      1|  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  224|       |
  225|       |  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
  226|      1|  if (FEATURE_SET(reg_ecx, AVX)) {
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
  227|       |    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
  228|      1|    if ((xgetbv() & 0x6) == 0x6) {
  ------------------
  |  Branch (228:9): [True: 1, False: 0]
  ------------------
  229|      1|      flags |= HAS_AVX;
  ------------------
  |  |  167|      1|#define HAS_AVX 0x40
  ------------------
  230|      1|      if (max_cpuid_val >= 7) {
  ------------------
  |  Branch (230:11): [True: 1, False: 0]
  ------------------
  231|       |        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
  232|      1|        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  233|      1|        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                      flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  234|       |        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
  235|       |        // Only set HAS_AVX512 flag if AVX512_DL feature are supported.
  236|       |        // Older AVX512 implementations (such as Skylake) have turbo curves that
  237|       |        // are currently problematic for mixed AVX512/AVX2 code
  238|      1|        if ((xgetbv() & 0xe6) == 0xe6) {
  ------------------
  |  Branch (238:13): [True: 0, False: 1]
  ------------------
  239|      0|          flags |=
  240|      0|              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  195|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                            FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  195|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  241|      0|                  ? HAS_AVX512
  ------------------
  |  |  170|      0|#define HAS_AVX512 0x200
  ------------------
  242|      0|                  : 0;
  243|      0|        }
  244|      1|      }
  245|      1|    }
  246|      1|  }
  247|      1|  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
  248|      1|  return flags & mask;
  249|      1|}
aom_dsp_rtcd.c:xgetbv:
  121|      2|static inline uint64_t xgetbv(void) {
  122|      2|  const uint32_t ecx = 0;
  123|      2|  uint32_t eax, edx;
  124|       |  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  125|      2|  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
  126|      2|                   : "=a"(eax), "=d"(edx)
  127|      2|                   : "c"(ecx));
  128|      2|  return ((uint64_t)edx << 32) | eax;
  129|      2|}
aom_scale_rtcd.c:x86_simd_caps:
  197|      1|static inline int x86_simd_caps(void) {
  198|      1|  unsigned int flags = 0;
  199|      1|  unsigned int mask = ~0u;
  200|      1|  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
  201|      1|  char *env;
  202|       |
  203|       |  /* See if the CPU capabilities are being overridden by the environment */
  204|      1|  env = getenv("AOM_SIMD_CAPS");
  205|      1|  if (env && *env) return (int)strtol(env, NULL, 0);
  ------------------
  |  Branch (205:7): [True: 0, False: 1]
  |  Branch (205:14): [True: 0, False: 0]
  ------------------
  206|       |
  207|      1|  env = getenv("AOM_SIMD_CAPS_MASK");
  208|      1|  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
  ------------------
  |  Branch (208:7): [True: 0, False: 1]
  |  Branch (208:14): [True: 0, False: 0]
  ------------------
  209|       |
  210|       |  /* Ensure that the CPUID instruction supports extended features */
  211|      1|  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  212|       |
  213|      1|  if (max_cpuid_val < 1) return 0;
  ------------------
  |  Branch (213:7): [True: 0, False: 1]
  ------------------
  214|       |
  215|       |  /* Get the standard feature flags */
  216|      1|  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  217|       |
  218|      1|  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  161|      1|#define HAS_MMX 0x01
  ------------------
  219|      1|  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  162|      1|#define HAS_SSE 0x02
  ------------------
  220|      1|  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  163|      1|#define HAS_SSE2 0x04
  ------------------
  221|      1|  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  164|      1|#define HAS_SSE3 0x08
  ------------------
  222|      1|  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  223|      1|  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  224|       |
  225|       |  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
  226|      1|  if (FEATURE_SET(reg_ecx, AVX)) {
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
  227|       |    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
  228|      1|    if ((xgetbv() & 0x6) == 0x6) {
  ------------------
  |  Branch (228:9): [True: 1, False: 0]
  ------------------
  229|      1|      flags |= HAS_AVX;
  ------------------
  |  |  167|      1|#define HAS_AVX 0x40
  ------------------
  230|      1|      if (max_cpuid_val >= 7) {
  ------------------
  |  Branch (230:11): [True: 1, False: 0]
  ------------------
  231|       |        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
  232|      1|        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  233|      1|        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                      flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  234|       |        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
  235|       |        // Only set HAS_AVX512 flag if AVX512_DL feature are supported.
  236|       |        // Older AVX512 implementations (such as Skylake) have turbo curves that
  237|       |        // are currently problematic for mixed AVX512/AVX2 code
  238|      1|        if ((xgetbv() & 0xe6) == 0xe6) {
  ------------------
  |  Branch (238:13): [True: 0, False: 1]
  ------------------
  239|      0|          flags |=
  240|      0|              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  195|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                            FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  195|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  241|      0|                  ? HAS_AVX512
  ------------------
  |  |  170|      0|#define HAS_AVX512 0x200
  ------------------
  242|      0|                  : 0;
  243|      0|        }
  244|      1|      }
  245|      1|    }
  246|      1|  }
  247|      1|  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
  248|      1|  return flags & mask;
  249|      1|}
aom_scale_rtcd.c:xgetbv:
  121|      2|static inline uint64_t xgetbv(void) {
  122|      2|  const uint32_t ecx = 0;
  123|      2|  uint32_t eax, edx;
  124|       |  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  125|      2|  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
  126|      2|                   : "=a"(eax), "=d"(edx)
  127|      2|                   : "c"(ecx));
  128|      2|  return ((uint64_t)edx << 32) | eax;
  129|      2|}
av1_rtcd.c:x86_simd_caps:
  197|      1|static inline int x86_simd_caps(void) {
  198|      1|  unsigned int flags = 0;
  199|      1|  unsigned int mask = ~0u;
  200|      1|  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
  201|      1|  char *env;
  202|       |
  203|       |  /* See if the CPU capabilities are being overridden by the environment */
  204|      1|  env = getenv("AOM_SIMD_CAPS");
  205|      1|  if (env && *env) return (int)strtol(env, NULL, 0);
  ------------------
  |  Branch (205:7): [True: 0, False: 1]
  |  Branch (205:14): [True: 0, False: 0]
  ------------------
  206|       |
  207|      1|  env = getenv("AOM_SIMD_CAPS_MASK");
  208|      1|  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
  ------------------
  |  Branch (208:7): [True: 0, False: 1]
  |  Branch (208:14): [True: 0, False: 0]
  ------------------
  209|       |
  210|       |  /* Ensure that the CPUID instruction supports extended features */
  211|      1|  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  212|       |
  213|      1|  if (max_cpuid_val < 1) return 0;
  ------------------
  |  Branch (213:7): [True: 0, False: 1]
  ------------------
  214|       |
  215|       |  /* Get the standard feature flags */
  216|      1|  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  217|       |
  218|      1|  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
  ------------------
  |  |  161|      1|#define HAS_MMX 0x01
  ------------------
  219|      1|  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
  ------------------
  |  |  162|      1|#define HAS_SSE 0x02
  ------------------
  220|      1|  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
  ------------------
  |  |  163|      1|#define HAS_SSE2 0x04
  ------------------
  221|      1|  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
  ------------------
  |  |  164|      1|#define HAS_SSE3 0x08
  ------------------
  222|      1|  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  223|      1|  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  224|       |
  225|       |  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
  226|      1|  if (FEATURE_SET(reg_ecx, AVX)) {
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
  227|       |    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
  228|      1|    if ((xgetbv() & 0x6) == 0x6) {
  ------------------
  |  Branch (228:9): [True: 1, False: 0]
  ------------------
  229|      1|      flags |= HAS_AVX;
  ------------------
  |  |  167|      1|#define HAS_AVX 0x40
  ------------------
  230|      1|      if (max_cpuid_val >= 7) {
  ------------------
  |  Branch (230:11): [True: 1, False: 0]
  ------------------
  231|       |        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
  232|      1|        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
  ------------------
  |  |   49|      1|  __asm__ __volatile__("cpuid           \n\t"                   \
  |  |   50|      1|                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
  |  |   51|      1|                       : "a"(func), "c"(func2))
  ------------------
  233|      1|        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  195|      1|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 1, False: 0]
  |  |  ------------------
  ------------------
                      flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  234|       |        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
  235|       |        // Only set HAS_AVX512 flag if AVX512_DL feature are supported.
  236|       |        // Older AVX512 implementations (such as Skylake) have turbo curves that
  237|       |        // are currently problematic for mixed AVX512/AVX2 code
  238|      1|        if ((xgetbv() & 0xe6) == 0xe6) {
  ------------------
  |  Branch (238:13): [True: 0, False: 1]
  ------------------
  239|      0|          flags |=
  240|      0|              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  195|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                            FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
  ------------------
  |  |  195|      0|  (((reg) & (feature##_BITS)) == (feature##_BITS))
  |  |  ------------------
  |  |  |  Branch (195:3): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  241|      0|                  ? HAS_AVX512
  ------------------
  |  |  170|      0|#define HAS_AVX512 0x200
  ------------------
  242|      0|                  : 0;
  243|      0|        }
  244|      1|      }
  245|      1|    }
  246|      1|  }
  247|      1|  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
  248|      1|  return flags & mask;
  249|      1|}
av1_rtcd.c:xgetbv:
  121|      2|static inline uint64_t xgetbv(void) {
  122|      2|  const uint32_t ecx = 0;
  123|      2|  uint32_t eax, edx;
  124|       |  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  125|      2|  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
  126|      2|                   : "=a"(eax), "=d"(edx)
  127|      2|                   : "c"(ecx));
  128|      2|  return ((uint64_t)edx << 32) | eax;
  129|      2|}

aom_scale_rtcd:
   18|  16.1k|void aom_scale_rtcd(void) { aom_once(setup_rtcd_internal); }

aom_free_frame_buffer:
   34|   310k|int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
   35|   310k|  if (ybf) {
  ------------------
  |  Branch (35:7): [True: 310k, False: 0]
  ------------------
   36|   310k|    if (ybf->buffer_alloc_sz > 0) {
  ------------------
  |  Branch (36:9): [True: 12.9k, False: 297k]
  ------------------
   37|  12.9k|      aom_free(ybf->buffer_alloc);
   38|  12.9k|    }
   39|       |#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
   40|       |    if (ybf->y_pyramid) {
   41|       |      aom_free_pyramid(ybf->y_pyramid);
   42|       |    }
   43|       |    if (ybf->corners) {
   44|       |      av1_free_corner_list(ybf->corners);
   45|       |    }
   46|       |#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
   47|   310k|    aom_remove_metadata_from_frame_buffer(ybf);
   48|       |    /* buffer_alloc isn't accessed by most functions.  Rather y_buffer,
   49|       |      u_buffer and v_buffer point to buffer_alloc and are used.  Clear out
   50|       |      all of this so that a freed pointer isn't inadvertently used */
   51|   310k|    memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
   52|   310k|    return 0;
   53|   310k|  }
   54|       |
   55|      0|  return AOM_CODEC_MEM_ERROR;
   56|   310k|}
aom_realloc_frame_buffer:
  241|   516k|                             bool alloc_pyramid, int alloc_y_plane_only) {
  242|   516k|  if (ybf) {
  ------------------
  |  Branch (242:7): [True: 516k, False: 0]
  ------------------
  243|   516k|    int y_stride = 0;
  244|   516k|    int uv_stride = 0;
  245|   516k|    uint64_t yplane_size = 0;
  246|   516k|    uint64_t uvplane_size = 0;
  247|   516k|    const int aligned_width = (width + 7) & ~7;
  248|   516k|    const int aligned_height = (height + 7) & ~7;
  249|   516k|    const int uv_width = aligned_width >> ss_x;
  250|   516k|    const int uv_height = aligned_height >> ss_y;
  251|   516k|    const int uv_border_w = border >> ss_x;
  252|   516k|    const int uv_border_h = border >> ss_y;
  253|       |
  254|   516k|    int error = calc_stride_and_planesize(
  255|   516k|        ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment,
  256|   516k|        alloc_y_plane_only, &y_stride, &uv_stride, &yplane_size, &uvplane_size,
  257|   516k|        uv_height);
  258|   516k|    if (error) return error;
  ------------------
  |  Branch (258:9): [True: 0, False: 516k]
  ------------------
  259|   516k|    return realloc_frame_buffer_aligned(
  260|   516k|        ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
  261|   516k|        byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
  262|   516k|        aligned_width, aligned_height, uv_width, uv_height, uv_stride,
  263|   516k|        uv_border_w, uv_border_h, alloc_pyramid, alloc_y_plane_only);
  264|   516k|  }
  265|      0|  return AOM_CODEC_MEM_ERROR;
  266|   516k|}
aom_alloc_frame_buffer:
  271|  10.4k|                           int alloc_y_plane_only) {
  272|  10.4k|  if (ybf) {
  ------------------
  |  Branch (272:7): [True: 10.4k, False: 0]
  ------------------
  273|  10.4k|    aom_free_frame_buffer(ybf);
  274|  10.4k|    return aom_realloc_frame_buffer(
  275|  10.4k|        ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
  276|  10.4k|        byte_alignment, NULL, NULL, NULL, alloc_pyramid, alloc_y_plane_only);
  277|  10.4k|  }
  278|      0|  return AOM_CODEC_MEM_ERROR;
  279|  10.4k|}
aom_remove_metadata_from_frame_buffer:
  281|   310k|void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
  282|   310k|  if (ybf && ybf->metadata) {
  ------------------
  |  Branch (282:7): [True: 310k, False: 0]
  |  Branch (282:14): [True: 0, False: 310k]
  ------------------
  283|      0|    aom_img_metadata_array_free(ybf->metadata);
  284|      0|    ybf->metadata = NULL;
  285|      0|  }
  286|   310k|}
yv12config.c:calc_stride_and_planesize:
  213|   516k|    uint64_t *yplane_size, uint64_t *uvplane_size, const int uv_height) {
  214|       |  /* Only support allocating buffers that have a border that's a multiple
  215|       |   * of 32. The border restriction is required to get 16-byte alignment of
  216|       |   * the start of the chroma rows without introducing an arbitrary gap
  217|       |   * between planes, which would break the semantics of things like
  218|       |   * aom_img_set_rect(). */
  219|   516k|  if (border & 0x1f) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (219:7): [True: 0, False: 516k]
  ------------------
  220|   516k|  *y_stride = aom_calc_y_stride(aligned_width, border);
  221|   516k|  *yplane_size =
  222|   516k|      (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment;
  223|       |
  224|   516k|  if (!alloc_y_plane_only) {
  ------------------
  |  Branch (224:7): [True: 516k, False: 0]
  ------------------
  225|   516k|    *uv_stride = *y_stride >> ss_x;
  226|   516k|    *uvplane_size =
  227|   516k|        (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) +
  228|   516k|        byte_alignment;
  229|   516k|  } else {
  230|      0|    *uv_stride = 0;
  231|      0|    *uvplane_size = 0;
  232|      0|  }
  233|   516k|  return 0;
  234|   516k|}
yv12config.c:realloc_frame_buffer_aligned:
   66|   516k|    bool alloc_pyramid, int alloc_y_plane_only) {
   67|   516k|  if (ybf) {
  ------------------
  |  Branch (67:7): [True: 516k, False: 0]
  ------------------
   68|   516k|    const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
  ------------------
  |  Branch (68:32): [True: 516k, False: 0]
  ------------------
   69|   516k|    const uint64_t frame_size =
   70|   516k|        (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
   71|       |
   72|   516k|    uint8_t *buf = NULL;
   73|       |
   74|   516k|#if CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER
   75|       |    // We should only need an 8-bit version of the source frame if we are
   76|       |    // encoding in non-realtime mode
   77|   516k|    (void)alloc_pyramid;
   78|   516k|    assert(!alloc_pyramid);
   79|   516k|#endif  // CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER
   80|       |
   81|   516k|#if defined AOM_MAX_ALLOCABLE_MEMORY
   82|       |    // The size of ybf->buffer_alloc.
   83|   516k|    uint64_t alloc_size = frame_size;
   84|       |#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
   85|       |    // The size of ybf->y_pyramid
   86|       |    if (alloc_pyramid) {
   87|       |      alloc_size += aom_get_pyramid_alloc_size(width, height, use_highbitdepth);
   88|       |      alloc_size += av1_get_corner_list_size();
   89|       |    }
   90|       |#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
   91|       |    // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
   92|       |    // pool. Bound the total amount of allocated memory as if these REF_FRAMES
   93|       |    // frame buffers were allocated in a single allocation.
   94|   516k|    if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES)
  ------------------
  |  Branch (94:9): [True: 596, False: 516k]
  ------------------
   95|    596|      return AOM_CODEC_MEM_ERROR;
   96|   516k|#endif
   97|       |
   98|   516k|    if (cb != NULL) {
  ------------------
  |  Branch (98:9): [True: 483k, False: 32.4k]
  ------------------
   99|   483k|      const int align_addr_extra_size = 31;
  100|   483k|      const uint64_t external_frame_size = frame_size + align_addr_extra_size;
  101|       |
  102|   483k|      assert(fb != NULL);
  103|       |
  104|   483k|      if (external_frame_size != (size_t)external_frame_size)
  ------------------
  |  Branch (104:11): [True: 0, False: 483k]
  ------------------
  105|      0|        return AOM_CODEC_MEM_ERROR;
  106|       |
  107|       |      // Allocation to hold larger frame, or first allocation.
  108|   483k|      if (cb(cb_priv, (size_t)external_frame_size, fb) < 0)
  ------------------
  |  Branch (108:11): [True: 0, False: 483k]
  ------------------
  109|      0|        return AOM_CODEC_MEM_ERROR;
  110|       |
  111|   483k|      if (fb->data == NULL || fb->size < external_frame_size)
  ------------------
  |  Branch (111:11): [True: 0, False: 483k]
  |  Branch (111:31): [True: 0, False: 483k]
  ------------------
  112|      0|        return AOM_CODEC_MEM_ERROR;
  113|       |
  114|   483k|      ybf->buffer_alloc = (uint8_t *)aom_align_addr(fb->data, 32);
  ------------------
  |  |   49|   483k|  (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
  ------------------
  115|       |
  116|   483k|#if defined(__has_feature)
  117|       |#if __has_feature(memory_sanitizer)
  118|       |      // This memset is needed for fixing the issue of using uninitialized
  119|       |      // value in msan test. It will cause a perf loss, so only do this for
  120|       |      // msan test.
  121|       |      memset(ybf->buffer_alloc, 0, (size_t)frame_size);
  122|       |#endif
  123|   483k|#endif
  124|   483k|    } else if (frame_size > ybf->buffer_alloc_sz) {
  ------------------
  |  Branch (124:16): [True: 13.3k, False: 19.1k]
  ------------------
  125|       |      // Allocation to hold larger frame, or first allocation.
  126|  13.3k|      aom_free(ybf->buffer_alloc);
  127|  13.3k|      ybf->buffer_alloc = NULL;
  128|  13.3k|      ybf->buffer_alloc_sz = 0;
  129|       |
  130|  13.3k|      if (frame_size != (size_t)frame_size) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (130:11): [True: 0, False: 13.3k]
  ------------------
  131|       |
  132|  13.3k|      ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size);
  133|  13.3k|      if (!ybf->buffer_alloc) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (133:11): [True: 0, False: 13.3k]
  ------------------
  134|       |
  135|  13.3k|      ybf->buffer_alloc_sz = (size_t)frame_size;
  136|       |
  137|       |      // This memset is needed for fixing valgrind error from C loop filter
  138|       |      // due to access uninitialized memory in frame border. It could be
  139|       |      // removed if border is totally removed.
  140|  13.3k|      memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
  141|  13.3k|    }
  142|       |
  143|   516k|    ybf->y_crop_width = width;
  144|   516k|    ybf->y_crop_height = height;
  145|   516k|    ybf->y_width = aligned_width;
  146|   516k|    ybf->y_height = aligned_height;
  147|   516k|    ybf->y_stride = y_stride;
  148|       |
  149|   516k|    ybf->uv_crop_width = (width + ss_x) >> ss_x;
  150|   516k|    ybf->uv_crop_height = (height + ss_y) >> ss_y;
  151|   516k|    ybf->uv_width = uv_width;
  152|   516k|    ybf->uv_height = uv_height;
  153|   516k|    ybf->uv_stride = uv_stride;
  154|       |
  155|   516k|    ybf->border = border;
  156|   516k|    ybf->frame_size = (size_t)frame_size;
  157|   516k|    ybf->subsampling_x = ss_x;
  158|   516k|    ybf->subsampling_y = ss_y;
  159|       |
  160|   516k|    buf = ybf->buffer_alloc;
  161|   516k|    if (use_highbitdepth) {
  ------------------
  |  Branch (161:9): [True: 208k, False: 307k]
  ------------------
  162|       |      // Store uint16 addresses when using 16bit framebuffers
  163|   208k|      buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
  ------------------
  |  |   76|   208k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  164|   208k|      ybf->flags = YV12_FLAG_HIGHBITDEPTH;
  ------------------
  |  |  142|   208k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  165|   307k|    } else {
  166|   307k|      ybf->flags = 0;
  167|   307k|    }
  168|       |
  169|   516k|    ybf->y_buffer = (uint8_t *)aom_align_addr(
  ------------------
  |  |   49|   516k|  (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
  ------------------
  170|   516k|        buf + (border * y_stride) + border, aom_byte_align);
  171|   516k|    if (!alloc_y_plane_only) {
  ------------------
  |  Branch (171:9): [True: 516k, False: 0]
  ------------------
  172|   516k|      ybf->u_buffer = (uint8_t *)aom_align_addr(
  ------------------
  |  |   49|   516k|  (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
  ------------------
  173|   516k|          buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
  174|   516k|          aom_byte_align);
  175|   516k|      ybf->v_buffer =
  176|   516k|          (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size +
  ------------------
  |  |   49|   516k|  (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
  ------------------
  177|   516k|                                        (uv_border_h * uv_stride) + uv_border_w,
  178|   516k|                                    aom_byte_align);
  179|   516k|    } else {
  180|      0|      ybf->u_buffer = NULL;
  181|      0|      ybf->v_buffer = NULL;
  182|      0|    }
  183|       |
  184|   516k|    ybf->use_external_reference_buffers = 0;
  185|       |
  186|       |#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
  187|       |    if (ybf->y_pyramid) {
  188|       |      aom_free_pyramid(ybf->y_pyramid);
  189|       |      ybf->y_pyramid = NULL;
  190|       |    }
  191|       |    if (ybf->corners) {
  192|       |      av1_free_corner_list(ybf->corners);
  193|       |      ybf->corners = NULL;
  194|       |    }
  195|       |    if (alloc_pyramid) {
  196|       |      ybf->y_pyramid = aom_alloc_pyramid(width, height, use_highbitdepth);
  197|       |      if (!ybf->y_pyramid) return AOM_CODEC_MEM_ERROR;
  198|       |      ybf->corners = av1_alloc_corner_list();
  199|       |      if (!ybf->corners) return AOM_CODEC_MEM_ERROR;
  200|       |    }
  201|       |#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
  202|       |
  203|   516k|    ybf->corrupted = 0; /* assume not corrupted by errors */
  204|   516k|    return 0;
  205|   516k|  }
  206|      0|  return AOM_CODEC_MEM_ERROR;
  207|   516k|}

aom_yv12_extend_frame_borders_c:
  148|  9.88k|                                     const int num_planes) {
  149|  9.88k|  assert(ybf->border % 2 == 0);
  150|  9.88k|  assert(ybf->y_height - ybf->y_crop_height < 16);
  151|  9.88k|  assert(ybf->y_width - ybf->y_crop_width < 16);
  152|  9.88k|  assert(ybf->y_height - ybf->y_crop_height >= 0);
  153|  9.88k|  assert(ybf->y_width - ybf->y_crop_width >= 0);
  154|       |
  155|  9.88k|#if CONFIG_AV1_HIGHBITDEPTH
  156|  9.88k|  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  9.88k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (156:7): [True: 7.43k, False: 2.44k]
  ------------------
  157|  18.1k|    for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (157:25): [True: 10.7k, False: 7.43k]
  ------------------
  158|  10.7k|      const int is_uv = plane > 0;
  159|  10.7k|      const int plane_border = ybf->border >> is_uv;
  160|  10.7k|      extend_plane_high(
  161|  10.7k|          ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv],
  162|  10.7k|          ybf->crop_heights[is_uv], plane_border, plane_border,
  163|  10.7k|          plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
  164|  10.7k|          plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0,
  165|  10.7k|          ybf->crop_heights[is_uv]);
  166|  10.7k|    }
  167|  7.43k|    return;
  168|  7.43k|  }
  169|  2.44k|#endif
  170|       |
  171|  9.17k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (171:23): [True: 6.72k, False: 2.44k]
  ------------------
  172|  6.72k|    const int is_uv = plane > 0;
  173|  6.72k|    const int plane_border = ybf->border >> is_uv;
  174|  6.72k|    extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
  175|  6.72k|                 ybf->crop_widths[is_uv], ybf->crop_heights[is_uv],
  176|  6.72k|                 plane_border, plane_border,
  177|  6.72k|                 plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
  178|  6.72k|                 plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0,
  179|  6.72k|                 ybf->crop_heights[is_uv]);
  180|  6.72k|  }
  181|  2.44k|}
aom_extend_frame_borders_c:
  221|  9.88k|void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) {
  222|  9.88k|  extend_frame(ybf, ybf->border, num_planes);
  223|  9.88k|}
aom_yv12_copy_frame_c:
  237|  9.88k|                           YV12_BUFFER_CONFIG *dst_bc, const int num_planes) {
  238|  9.88k|  assert(src_bc->y_width == dst_bc->y_width);
  239|  9.88k|  assert(src_bc->y_height == dst_bc->y_height);
  240|       |
  241|  9.88k|#if CONFIG_AV1_HIGHBITDEPTH
  242|  9.88k|  assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) ==
  243|  9.88k|         (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH));
  244|       |
  245|  9.88k|  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  9.88k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (245:7): [True: 7.43k, False: 2.44k]
  ------------------
  246|  18.1k|    for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (246:25): [True: 10.7k, False: 7.43k]
  ------------------
  247|  10.7k|      const uint8_t *plane_src = src_bc->buffers[plane];
  248|  10.7k|      uint8_t *plane_dst = dst_bc->buffers[plane];
  249|  10.7k|      const int is_uv = plane > 0;
  250|       |
  251|  1.53M|      for (int row = 0; row < src_bc->heights[is_uv]; ++row) {
  ------------------
  |  Branch (251:25): [True: 1.52M, False: 10.7k]
  ------------------
  252|  1.52M|        memcpy_short_addr(plane_dst, plane_src, src_bc->widths[is_uv]);
  253|  1.52M|        plane_src += src_bc->strides[is_uv];
  254|  1.52M|        plane_dst += dst_bc->strides[is_uv];
  255|  1.52M|      }
  256|  10.7k|    }
  257|  7.43k|    aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
  258|  7.43k|    return;
  259|  7.43k|  }
  260|  2.44k|#endif
  261|  9.17k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (261:23): [True: 6.72k, False: 2.44k]
  ------------------
  262|  6.72k|    const uint8_t *plane_src = src_bc->buffers[plane];
  263|  6.72k|    uint8_t *plane_dst = dst_bc->buffers[plane];
  264|  6.72k|    const int is_uv = plane > 0;
  265|       |
  266|   590k|    for (int row = 0; row < src_bc->heights[is_uv]; ++row) {
  ------------------
  |  Branch (266:23): [True: 584k, False: 6.72k]
  ------------------
  267|   584k|      memcpy(plane_dst, plane_src, src_bc->widths[is_uv]);
  268|   584k|      plane_src += src_bc->strides[is_uv];
  269|   584k|      plane_dst += dst_bc->strides[is_uv];
  270|   584k|    }
  271|  6.72k|  }
  272|  2.44k|  aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
  273|  2.44k|}
aom_yv12_partial_copy_y_c:
  358|  29.0k|                               int vstart2) {
  359|  29.0k|  int row;
  360|  29.0k|  const uint8_t *src = src_ybc->y_buffer;
  361|  29.0k|  uint8_t *dst = dst_ybc->y_buffer;
  362|  29.0k|#if CONFIG_AV1_HIGHBITDEPTH
  363|  29.0k|  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  29.0k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (363:7): [True: 17.6k, False: 11.3k]
  ------------------
  364|  17.6k|    const uint16_t *src16 =
  365|  17.6k|        CONVERT_TO_SHORTPTR(src + vstart1 * src_ybc->y_stride + hstart1);
  ------------------
  |  |   75|  17.6k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  366|  17.6k|    uint16_t *dst16 =
  367|  17.6k|        CONVERT_TO_SHORTPTR(dst + vstart2 * dst_ybc->y_stride + hstart2);
  ------------------
  |  |   75|  17.6k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  368|       |
  369|  2.13M|    for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (369:25): [True: 2.11M, False: 17.6k]
  ------------------
  370|  2.11M|      memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
  371|  2.11M|      src16 += src_ybc->y_stride;
  372|  2.11M|      dst16 += dst_ybc->y_stride;
  373|  2.11M|    }
  374|  17.6k|    return;
  375|  17.6k|  }
  376|  11.3k|#endif
  377|  11.3k|  src = (src + vstart1 * src_ybc->y_stride + hstart1);
  378|  11.3k|  dst = (dst + vstart2 * dst_ybc->y_stride + hstart2);
  379|       |
  380|  1.37M|  for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (380:23): [True: 1.35M, False: 11.3k]
  ------------------
  381|  1.35M|    memcpy(dst, src, (hend1 - hstart1));
  382|  1.35M|    src += src_ybc->y_stride;
  383|  1.35M|    dst += dst_ybc->y_stride;
  384|  1.35M|  }
  385|  11.3k|}
aom_yv12_partial_coloc_copy_y_c:
  389|  29.0k|                                     int hend, int vstart, int vend) {
  390|  29.0k|  aom_yv12_partial_copy_y_c(src_ybc, hstart, hend, vstart, vend, dst_ybc,
  391|  29.0k|                            hstart, vstart);
  392|  29.0k|}
aom_yv12_partial_copy_u_c:
  397|  24.1k|                               int vstart2) {
  398|  24.1k|  int row;
  399|  24.1k|  const uint8_t *src = src_bc->u_buffer;
  400|  24.1k|  uint8_t *dst = dst_bc->u_buffer;
  401|  24.1k|#if CONFIG_AV1_HIGHBITDEPTH
  402|  24.1k|  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  24.1k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (402:7): [True: 13.1k, False: 10.9k]
  ------------------
  403|  13.1k|    const uint16_t *src16 =
  404|  13.1k|        CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
  ------------------
  |  |   75|  13.1k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  405|  13.1k|    uint16_t *dst16 =
  406|  13.1k|        CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2);
  ------------------
  |  |   75|  13.1k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  407|   860k|    for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (407:25): [True: 847k, False: 13.1k]
  ------------------
  408|   847k|      memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
  409|   847k|      src16 += src_bc->uv_stride;
  410|   847k|      dst16 += dst_bc->uv_stride;
  411|   847k|    }
  412|  13.1k|    return;
  413|  13.1k|  }
  414|  10.9k|#endif
  415|  10.9k|  src = (src + vstart1 * src_bc->uv_stride + hstart1);
  416|  10.9k|  dst = (dst + vstart2 * dst_bc->uv_stride + hstart2);
  417|       |
  418|   724k|  for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (418:23): [True: 713k, False: 10.9k]
  ------------------
  419|   713k|    memcpy(dst, src, (hend1 - hstart1));
  420|   713k|    src += src_bc->uv_stride;
  421|   713k|    dst += dst_bc->uv_stride;
  422|   713k|  }
  423|  10.9k|}
aom_yv12_partial_coloc_copy_u_c:
  427|  24.1k|                                     int hend, int vstart, int vend) {
  428|  24.1k|  aom_yv12_partial_copy_u_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart,
  429|  24.1k|                            vstart);
  430|  24.1k|}
aom_yv12_partial_copy_v_c:
  435|  9.73k|                               int vstart2) {
  436|  9.73k|  int row;
  437|  9.73k|  const uint8_t *src = src_bc->v_buffer;
  438|  9.73k|  uint8_t *dst = dst_bc->v_buffer;
  439|  9.73k|#if CONFIG_AV1_HIGHBITDEPTH
  440|  9.73k|  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  9.73k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (440:7): [True: 4.72k, False: 5.00k]
  ------------------
  441|  4.72k|    const uint16_t *src16 =
  442|  4.72k|        CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
  ------------------
  |  |   75|  4.72k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  443|  4.72k|    uint16_t *dst16 =
  444|  4.72k|        CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2);
  ------------------
  |  |   75|  4.72k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  445|   412k|    for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (445:25): [True: 407k, False: 4.72k]
  ------------------
  446|   407k|      memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
  447|   407k|      src16 += src_bc->uv_stride;
  448|   407k|      dst16 += dst_bc->uv_stride;
  449|   407k|    }
  450|  4.72k|    return;
  451|  4.72k|  }
  452|  5.00k|#endif
  453|  5.00k|  src = (src + vstart1 * src_bc->uv_stride + hstart1);
  454|  5.00k|  dst = (dst + vstart2 * dst_bc->uv_stride + hstart2);
  455|       |
  456|   289k|  for (row = vstart1; row < vend1; ++row) {
  ------------------
  |  Branch (456:23): [True: 284k, False: 5.00k]
  ------------------
  457|   284k|    memcpy(dst, src, (hend1 - hstart1));
  458|   284k|    src += src_bc->uv_stride;
  459|   284k|    dst += dst_bc->uv_stride;
  460|   284k|  }
  461|  5.00k|}
aom_yv12_partial_coloc_copy_v_c:
  465|  9.73k|                                     int hend, int vstart, int vend) {
  466|  9.73k|  aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart,
  467|  9.73k|                            vstart);
  468|  9.73k|}
yv12extend.c:extend_plane_high:
   70|  21.4k|                              int v_end) {
   71|  21.4k|  int i;
   72|  21.4k|  const int linesize = extend_left + extend_right + width;
   73|  21.4k|  assert(linesize <= src_stride);
   74|  21.4k|  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  ------------------
  |  |   75|  21.4k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   75|       |
   76|       |  /* copy the left and right most columns out */
   77|  21.4k|  uint16_t *src_ptr1 = src + v_start * src_stride;
   78|  21.4k|  uint16_t *src_ptr2 = src + v_start * src_stride + width - 1;
   79|  21.4k|  uint16_t *dst_ptr1 = src + v_start * src_stride - extend_left;
   80|  21.4k|  uint16_t *dst_ptr2 = src_ptr2 + 1;
   81|       |
   82|  2.99M|  for (i = v_start; i < v_end; ++i) {
  ------------------
  |  Branch (82:21): [True: 2.97M, False: 21.4k]
  ------------------
   83|  2.97M|    aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
   84|  2.97M|    aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
   85|  2.97M|    src_ptr1 += src_stride;
   86|  2.97M|    src_ptr2 += src_stride;
   87|  2.97M|    dst_ptr1 += src_stride;
   88|  2.97M|    dst_ptr2 += src_stride;
   89|  2.97M|  }
   90|       |
   91|       |  /* Now copy the top and bottom lines into each line of the respective
   92|       |   * borders
   93|       |   */
   94|  21.4k|  src_ptr1 = src - extend_left;
   95|  21.4k|  dst_ptr1 = src_ptr1 + src_stride * -extend_top;
   96|       |
   97|  5.40M|  for (i = 0; i < extend_top; ++i) {
  ------------------
  |  Branch (97:15): [True: 5.38M, False: 21.4k]
  ------------------
   98|  5.38M|    memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
   99|  5.38M|    dst_ptr1 += src_stride;
  100|  5.38M|  }
  101|       |
  102|  21.4k|  src_ptr2 = src_ptr1 + src_stride * (height - 1);
  103|  21.4k|  dst_ptr2 = src_ptr2;
  104|       |
  105|  5.47M|  for (i = 0; i < extend_bottom; ++i) {
  ------------------
  |  Branch (105:15): [True: 5.45M, False: 21.4k]
  ------------------
  106|  5.45M|    dst_ptr2 += src_stride;
  107|  5.45M|    memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
  108|  5.45M|  }
  109|  21.4k|}
yv12extend.c:extend_plane:
   25|  13.4k|                         int v_end) {
   26|  13.4k|  assert(src != NULL);
   27|  13.4k|  int i;
   28|  13.4k|  const int linesize = extend_left + extend_right + width;
   29|  13.4k|  assert(linesize <= src_stride);
   30|       |
   31|       |  /* copy the left and right most columns out */
   32|  13.4k|  uint8_t *src_ptr1 = src + v_start * src_stride;
   33|  13.4k|  uint8_t *src_ptr2 = src + v_start * src_stride + width - 1;
   34|  13.4k|  uint8_t *dst_ptr1 = src + v_start * src_stride - extend_left;
   35|  13.4k|  uint8_t *dst_ptr2 = src_ptr2 + 1;
   36|       |
   37|  1.12M|  for (i = v_start; i < v_end; ++i) {
  ------------------
  |  Branch (37:21): [True: 1.11M, False: 13.4k]
  ------------------
   38|  1.11M|    memset(dst_ptr1, src_ptr1[0], extend_left);
   39|  1.11M|    memset(dst_ptr2, src_ptr2[0], extend_right);
   40|  1.11M|    src_ptr1 += src_stride;
   41|  1.11M|    src_ptr2 += src_stride;
   42|  1.11M|    dst_ptr1 += src_stride;
   43|  1.11M|    dst_ptr2 += src_stride;
   44|  1.11M|  }
   45|       |
   46|       |  /* Now copy the top and bottom lines into each line of the respective
   47|       |   * borders
   48|       |   */
   49|  13.4k|  src_ptr1 = src - extend_left;
   50|  13.4k|  dst_ptr1 = src_ptr1 + src_stride * -extend_top;
   51|       |
   52|  2.79M|  for (i = 0; i < extend_top; ++i) {
  ------------------
  |  Branch (52:15): [True: 2.78M, False: 13.4k]
  ------------------
   53|  2.78M|    memcpy(dst_ptr1, src_ptr1, linesize);
   54|  2.78M|    dst_ptr1 += src_stride;
   55|  2.78M|  }
   56|       |
   57|  13.4k|  src_ptr2 = src_ptr1 + src_stride * (height - 1);
   58|  13.4k|  dst_ptr2 = src_ptr2;
   59|       |
   60|  2.84M|  for (i = 0; i < extend_bottom; ++i) {
  ------------------
  |  Branch (60:15): [True: 2.83M, False: 13.4k]
  ------------------
   61|  2.83M|    dst_ptr2 += src_stride;
   62|  2.83M|    memcpy(dst_ptr2, src_ptr2, linesize);
   63|  2.83M|  }
   64|  13.4k|}
yv12extend.c:extend_frame:
  184|  9.88k|                         const int num_planes) {
  185|  9.88k|  const int ss_x = ybf->subsampling_x;
  186|  9.88k|  const int ss_y = ybf->subsampling_y;
  187|       |
  188|  9.88k|  assert(ybf->y_height - ybf->y_crop_height < 16);
  189|  9.88k|  assert(ybf->y_width - ybf->y_crop_width < 16);
  190|  9.88k|  assert(ybf->y_height - ybf->y_crop_height >= 0);
  191|  9.88k|  assert(ybf->y_width - ybf->y_crop_width >= 0);
  192|       |
  193|  9.88k|#if CONFIG_AV1_HIGHBITDEPTH
  194|  9.88k|  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  9.88k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (194:7): [True: 7.43k, False: 2.44k]
  ------------------
  195|  18.1k|    for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (195:25): [True: 10.7k, False: 7.43k]
  ------------------
  196|  10.7k|      const int is_uv = plane > 0;
  197|  10.7k|      const int top = ext_size >> (is_uv ? ss_y : 0);
  ------------------
  |  Branch (197:36): [True: 3.27k, False: 7.43k]
  ------------------
  198|  10.7k|      const int left = ext_size >> (is_uv ? ss_x : 0);
  ------------------
  |  Branch (198:37): [True: 3.27k, False: 7.43k]
  ------------------
  199|  10.7k|      const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv];
  200|  10.7k|      const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
  201|  10.7k|      extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv],
  202|  10.7k|                        ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top,
  203|  10.7k|                        left, bottom, right, 0, ybf->crop_heights[is_uv]);
  204|  10.7k|    }
  205|  7.43k|    return;
  206|  7.43k|  }
  207|  2.44k|#endif
  208|       |
  209|  9.17k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (209:23): [True: 6.72k, False: 2.44k]
  ------------------
  210|  6.72k|    const int is_uv = plane > 0;
  211|  6.72k|    const int top = ext_size >> (is_uv ? ss_y : 0);
  ------------------
  |  Branch (211:34): [True: 4.28k, False: 2.44k]
  ------------------
  212|  6.72k|    const int left = ext_size >> (is_uv ? ss_x : 0);
  ------------------
  |  Branch (212:35): [True: 4.28k, False: 2.44k]
  ------------------
  213|  6.72k|    const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv];
  214|  6.72k|    const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
  215|  6.72k|    extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
  216|  6.72k|                 ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left,
  217|  6.72k|                 bottom, right, 0, ybf->crop_heights[is_uv]);
  218|  6.72k|  }
  219|  2.44k|}
yv12extend.c:memcpy_short_addr:
  226|  1.52M|static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
  227|  1.52M|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|  1.52M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  228|  1.52M|  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  ------------------
  |  |   75|  1.52M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  229|  1.52M|  memcpy(dst, src, num * sizeof(uint16_t));
  230|  1.52M|}

yv12config.c:aom_calc_y_stride:
  215|   516k|static inline int aom_calc_y_stride(int aligned_width, int border) {
  216|   516k|  return ((aligned_width + 2 * border) + 31) & ~31;
  217|   516k|}

aom_get_worker_interface:
  242|  1.40M|const AVxWorkerInterface *aom_get_worker_interface(void) {
  243|  1.40M|  return &g_worker_interface;
  244|  1.40M|}
aom_thread.c:init:
  123|   161k|static void init(AVxWorker *const worker) {
  124|   161k|  memset(worker, 0, sizeof(*worker));
  125|   161k|  worker->status_ = AVX_WORKER_STATUS_NOT_OK;
  126|   161k|}
aom_thread.c:reset:
  136|   125k|static int reset(AVxWorker *const worker) {
  137|   125k|  int ok = 1;
  138|   125k|  worker->had_error = 0;
  139|   125k|  if (worker->status_ < AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (139:7): [True: 125k, False: 0]
  ------------------
  140|   125k|#if CONFIG_MULTITHREAD
  141|   125k|    worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_));
  142|   125k|    if (worker->impl_ == NULL) {
  ------------------
  |  Branch (142:9): [True: 0, False: 125k]
  ------------------
  143|      0|      return 0;
  144|      0|    }
  145|   125k|    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
  ------------------
  |  Branch (145:9): [True: 0, False: 125k]
  ------------------
  146|      0|      goto Error;
  147|      0|    }
  148|   125k|    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
  ------------------
  |  Branch (148:9): [True: 0, False: 125k]
  ------------------
  149|      0|      pthread_mutex_destroy(&worker->impl_->mutex_);
  150|      0|      goto Error;
  151|      0|    }
  152|   125k|    pthread_attr_t attr;
  153|   125k|    if (pthread_attr_init(&attr)) goto Error2;
  ------------------
  |  Branch (153:9): [True: 0, False: 125k]
  ------------------
  154|       |      // Debug ASan builds require at least ~1MiB of stack; prevents
  155|       |      // failures on macOS arm64 where the default is 512KiB.
  156|       |      // See: https://crbug.com/aomedia/3379
  157|       |#if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \
  158|       |    !defined(NDEBUG)
  159|       |    const size_t kMinStackSize = 1024 * 1024;
  160|       |#else
  161|   125k|    const size_t kMinStackSize = 256 * 1024;
  162|   125k|#endif
  163|   125k|    size_t stacksize;
  164|   125k|    if (!pthread_attr_getstacksize(&attr, &stacksize)) {
  ------------------
  |  Branch (164:9): [True: 125k, False: 0]
  ------------------
  165|   125k|      if (stacksize < kMinStackSize &&
  ------------------
  |  Branch (165:11): [True: 0, False: 125k]
  ------------------
  166|   125k|          pthread_attr_setstacksize(&attr, kMinStackSize)) {
  ------------------
  |  Branch (166:11): [True: 0, False: 0]
  ------------------
  167|      0|        pthread_attr_destroy(&attr);
  168|      0|        goto Error2;
  169|      0|      }
  170|   125k|    }
  171|   125k|    pthread_mutex_lock(&worker->impl_->mutex_);
  172|   125k|    ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker);
  173|   125k|    if (ok) worker->status_ = AVX_WORKER_STATUS_OK;
  ------------------
  |  Branch (173:9): [True: 125k, False: 0]
  ------------------
  174|   125k|    pthread_mutex_unlock(&worker->impl_->mutex_);
  175|   125k|    pthread_attr_destroy(&attr);
  176|   125k|    if (!ok) {
  ------------------
  |  Branch (176:9): [True: 0, False: 125k]
  ------------------
  177|      0|    Error2:
  178|      0|      pthread_mutex_destroy(&worker->impl_->mutex_);
  179|      0|      pthread_cond_destroy(&worker->impl_->condition_);
  180|      0|    Error:
  181|      0|      aom_free(worker->impl_);
  182|      0|      worker->impl_ = NULL;
  183|      0|      return 0;
  184|      0|    }
  185|       |#else
  186|       |    worker->status_ = AVX_WORKER_STATUS_OK;
  187|       |#endif
  188|   125k|  } else if (worker->status_ > AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (188:14): [True: 0, False: 0]
  ------------------
  189|      0|    ok = sync(worker);
  190|      0|  }
  191|   125k|  assert(!ok || (worker->status_ == AVX_WORKER_STATUS_OK));
  192|   125k|  return ok;
  193|   125k|}
aom_thread.c:thread_loop:
   45|   125k|static THREADFN thread_loop(void *ptr) {
   46|   125k|  AVxWorker *const worker = (AVxWorker *)ptr;
   47|       |#ifdef __APPLE__
   48|       |  if (worker->thread_name != NULL) {
   49|       |    // Apple's version of pthread_setname_np takes one argument and operates on
   50|       |    // the current thread only. The maximum size of the thread_name buffer was
   51|       |    // noted in the Chromium source code and was confirmed by experiments. If
   52|       |    // thread_name is too long, pthread_setname_np returns -1 with errno
   53|       |    // ENAMETOOLONG (63).
   54|       |    char thread_name[64];
   55|       |    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
   56|       |    thread_name[sizeof(thread_name) - 1] = '\0';
   57|       |    pthread_setname_np(thread_name);
   58|       |  }
   59|       |#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__)
   60|   125k|  if (worker->thread_name != NULL) {
  ------------------
  |  Branch (60:7): [True: 125k, False: 3]
  ------------------
   61|       |    // Linux and Android require names (with nul) fit in 16 chars, otherwise
   62|       |    // pthread_setname_np() returns ERANGE (34).
   63|   125k|    char thread_name[16];
   64|   125k|    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
   65|   125k|    thread_name[sizeof(thread_name) - 1] = '\0';
   66|   125k|    pthread_setname_np(pthread_self(), thread_name);
   67|   125k|  }
   68|   125k|#endif
   69|   125k|  pthread_mutex_lock(&worker->impl_->mutex_);
   70|  2.44M|  for (;;) {
   71|  4.89M|    while (worker->status_ == AVX_WORKER_STATUS_OK) {  // wait in idling mode
  ------------------
  |  Branch (71:12): [True: 2.44M, False: 2.44M]
  ------------------
   72|  2.44M|      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
   73|  2.44M|    }
   74|  2.44M|    if (worker->status_ == AVX_WORKER_STATUS_WORKING) {
  ------------------
  |  Branch (74:9): [True: 2.32M, False: 125k]
  ------------------
   75|       |      // When worker->status_ is AVX_WORKER_STATUS_WORKING, the main thread
   76|       |      // doesn't change worker->status_ and will wait until the worker changes
   77|       |      // worker->status_ to AVX_WORKER_STATUS_OK. See change_state(). So the
   78|       |      // worker can safely call execute() without holding worker->impl_->mutex_.
   79|       |      // When the worker reacquires worker->impl_->mutex_, worker->status_ must
   80|       |      // still be AVX_WORKER_STATUS_WORKING.
   81|  2.32M|      pthread_mutex_unlock(&worker->impl_->mutex_);
   82|  2.32M|      execute(worker);
   83|  2.32M|      pthread_mutex_lock(&worker->impl_->mutex_);
   84|  2.32M|      assert(worker->status_ == AVX_WORKER_STATUS_WORKING);
   85|  2.32M|      worker->status_ = AVX_WORKER_STATUS_OK;
   86|       |      // signal to the main thread that we're done (for sync())
   87|  2.32M|      pthread_cond_signal(&worker->impl_->condition_);
   88|  2.32M|    } else {
   89|   125k|      assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK);  // finish the worker
   90|   125k|      break;
   91|   125k|    }
   92|  2.44M|  }
   93|   127k|  pthread_mutex_unlock(&worker->impl_->mutex_);
   94|   127k|  return THREAD_EXIT_SUCCESS;  // Thread is finished
  ------------------
  |  |  176|   127k|#define THREAD_EXIT_SUCCESS NULL
  ------------------
   95|   125k|}
aom_thread.c:sync:
  128|  4.71M|static int sync(AVxWorker *const worker) {
  129|  4.71M|#if CONFIG_MULTITHREAD
  130|  4.71M|  change_state(worker, AVX_WORKER_STATUS_OK);
  131|  4.71M|#endif
  132|  4.71M|  assert(worker->status_ <= AVX_WORKER_STATUS_OK);
  133|  4.71M|  return !worker->had_error;
  134|  4.71M|}
aom_thread.c:change_state:
   98|  7.15M|static void change_state(AVxWorker *const worker, AVxWorkerStatus new_status) {
   99|       |  // No-op when attempting to change state on a thread that didn't come up.
  100|       |  // Checking status_ without acquiring the lock first would result in a data
  101|       |  // race.
  102|  7.15M|  if (worker->impl_ == NULL) return;
  ------------------
  |  Branch (102:7): [True: 680k, False: 6.47M]
  ------------------
  103|       |
  104|  6.47M|  pthread_mutex_lock(&worker->impl_->mutex_);
  105|  6.47M|  if (worker->status_ >= AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (105:7): [True: 6.47M, False: 0]
  ------------------
  106|       |    // wait for the worker to finish
  107|  6.57M|    while (worker->status_ != AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (107:12): [True: 98.7k, False: 6.47M]
  ------------------
  108|  98.7k|      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
  109|  98.7k|    }
  110|       |    // assign new status and release the working thread if needed
  111|  6.47M|    if (new_status != AVX_WORKER_STATUS_OK) {
  ------------------
  |  Branch (111:9): [True: 2.44M, False: 4.02M]
  ------------------
  112|  2.44M|      worker->status_ = new_status;
  113|  2.44M|      pthread_cond_signal(&worker->impl_->condition_);
  114|  2.44M|    }
  115|  6.47M|  }
  116|  6.47M|  pthread_mutex_unlock(&worker->impl_->mutex_);
  117|  6.47M|}
aom_thread.c:launch:
  201|  2.32M|static void launch(AVxWorker *const worker) {
  202|  2.32M|#if CONFIG_MULTITHREAD
  203|  2.32M|  change_state(worker, AVX_WORKER_STATUS_WORKING);
  204|       |#else
  205|       |  execute(worker);
  206|       |#endif
  207|  2.32M|}
aom_thread.c:execute:
  195|  2.75M|static void execute(AVxWorker *const worker) {
  196|  2.75M|  if (worker->hook != NULL) {
  ------------------
  |  Branch (196:7): [True: 2.75M, False: 1.62k]
  ------------------
  197|  2.75M|    worker->had_error |= !worker->hook(worker->data1, worker->data2);
  198|  2.75M|  }
  199|  2.75M|}
aom_thread.c:end:
  209|   161k|static void end(AVxWorker *const worker) {
  210|   161k|#if CONFIG_MULTITHREAD
  211|   161k|  if (worker->impl_ != NULL) {
  ------------------
  |  Branch (211:7): [True: 125k, False: 35.7k]
  ------------------
  212|   125k|    change_state(worker, AVX_WORKER_STATUS_NOT_OK);
  213|   125k|    pthread_join(worker->impl_->thread_, NULL);
  214|   125k|    pthread_mutex_destroy(&worker->impl_->mutex_);
  215|   125k|    pthread_cond_destroy(&worker->impl_->condition_);
  216|   125k|    aom_free(worker->impl_);
  217|   125k|    worker->impl_ = NULL;
  218|   125k|  }
  219|       |#else
  220|       |  worker->status_ = AVX_WORKER_STATUS_NOT_OK;
  221|       |  assert(worker->impl_ == NULL);
  222|       |#endif
  223|   161k|  assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK);
  224|   161k|}

aom_codec_av1_dx:
 1786|  16.1k|aom_codec_iface_t *aom_codec_av1_dx(void) { return &aom_codec_av1_dx_algo; }
av1_dx_iface.c:decoder_init:
   86|  16.1k|static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx) {
   87|       |  // This function only allocates space for the aom_codec_alg_priv_t
   88|       |  // structure. More memory may be required at the time the stream
   89|       |  // information becomes known.
   90|  16.1k|  if (!ctx->priv) {
  ------------------
  |  Branch (90:7): [True: 16.1k, False: 0]
  ------------------
   91|  16.1k|    aom_codec_alg_priv_t *const priv =
   92|  16.1k|        (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv));
   93|  16.1k|    if (priv == NULL) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (93:9): [True: 0, False: 16.1k]
  ------------------
   94|       |
   95|  16.1k|    ctx->priv = (aom_codec_priv_t *)priv;
   96|  16.1k|    ctx->priv->init_flags = ctx->init_flags;
   97|  16.1k|    priv->flushed = 0;
   98|       |
   99|       |    // TODO(tdaede): this should not be exposed to the API
  100|  16.1k|    priv->cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
  ------------------
  |  |   79|  16.1k|#define FORCE_HIGHBITDEPTH_DECODING 0
  ------------------
  101|  16.1k|    if (ctx->config.dec) {
  ------------------
  |  Branch (101:9): [True: 16.1k, False: 0]
  ------------------
  102|  16.1k|      priv->cfg = *ctx->config.dec;
  103|  16.1k|      ctx->config.dec = &priv->cfg;
  104|  16.1k|    }
  105|  16.1k|    priv->num_grain_image_frame_buffers = 0;
  106|       |    // Turn row_mt on by default.
  107|  16.1k|    priv->row_mt = 1;
  108|       |
  109|       |    // Turn on normal tile coding mode by default.
  110|       |    // 0 is for normal tile coding mode, and 1 is for large scale tile coding
  111|       |    // mode(refer to lightfield example).
  112|  16.1k|    priv->tile_mode = 0;
  113|  16.1k|    priv->decode_tile_row = -1;
  114|  16.1k|    priv->decode_tile_col = -1;
  115|  16.1k|  }
  116|       |
  117|  16.1k|  return AOM_CODEC_OK;
  118|  16.1k|}
av1_dx_iface.c:decoder_destroy:
  120|  16.1k|static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
  121|  16.1k|  if (ctx->frame_worker != NULL) {
  ------------------
  |  Branch (121:7): [True: 16.1k, False: 80]
  ------------------
  122|  16.1k|    AVxWorker *const worker = ctx->frame_worker;
  123|  16.1k|    aom_get_worker_interface()->end(worker);
  124|  16.1k|    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  125|  16.1k|    if (frame_worker_data != NULL && frame_worker_data->pbi != NULL) {
  ------------------
  |  Branch (125:9): [True: 16.1k, False: 0]
  |  Branch (125:38): [True: 16.1k, False: 0]
  ------------------
  126|  16.1k|      AV1Decoder *const pbi = frame_worker_data->pbi;
  127|  16.1k|      aom_free(pbi->common.tpl_mvs);
  128|  16.1k|      pbi->common.tpl_mvs = NULL;
  129|  16.1k|      av1_remove_common(&pbi->common);
  130|  16.1k|      av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync);
  131|  16.1k|      av1_free_cdef_sync(&pbi->cdef_sync);
  132|  16.1k|      av1_free_restoration_buffers(&pbi->common);
  133|  16.1k|      av1_decoder_remove(pbi);
  134|  16.1k|    }
  135|  16.1k|    aom_free(frame_worker_data);
  136|  16.1k|  }
  137|       |
  138|  16.1k|  if (ctx->buffer_pool) {
  ------------------
  |  Branch (138:7): [True: 16.1k, False: 80]
  ------------------
  139|  16.4k|    for (size_t i = 0; i < ctx->num_grain_image_frame_buffers; i++) {
  ------------------
  |  Branch (139:24): [True: 367, False: 16.1k]
  ------------------
  140|    367|      ctx->buffer_pool->release_fb_cb(ctx->buffer_pool->cb_priv,
  141|    367|                                      &ctx->grain_image_frame_buffers[i]);
  142|    367|    }
  143|  16.1k|    av1_free_ref_frame_buffers(ctx->buffer_pool);
  144|  16.1k|    av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
  145|  16.1k|#if CONFIG_MULTITHREAD
  146|  16.1k|    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
  147|  16.1k|#endif
  148|  16.1k|  }
  149|       |
  150|  16.1k|  aom_free(ctx->frame_worker);
  151|  16.1k|  aom_free(ctx->buffer_pool);
  152|  16.1k|  assert(!ctx->img.self_allocd);
  153|  16.1k|  aom_img_free(&ctx->img);
  154|  16.1k|  aom_free(ctx);
  155|  16.1k|  return AOM_CODEC_OK;
  156|  16.1k|}
av1_dx_iface.c:ctrl_set_tile_mode:
 1620|  16.1k|                                          va_list args) {
 1621|  16.1k|  ctx->tile_mode = va_arg(args, unsigned int);
 1622|  16.1k|  return AOM_CODEC_OK;
 1623|  16.1k|}
av1_dx_iface.c:ctrl_set_is_annexb:
 1626|  16.1k|                                          va_list args) {
 1627|  16.1k|  ctx->is_annexb = va_arg(args, unsigned int);
 1628|  16.1k|  return AOM_CODEC_OK;
 1629|  16.1k|}
av1_dx_iface.c:ctrl_set_operating_point:
 1632|  16.1k|                                                va_list args) {
 1633|  16.1k|  ctx->operating_point = va_arg(args, int);
 1634|  16.1k|  return AOM_CODEC_OK;
 1635|  16.1k|}
av1_dx_iface.c:ctrl_set_output_all_layers:
 1638|  16.1k|                                                  va_list args) {
 1639|  16.1k|  ctx->output_all_layers = va_arg(args, int);
 1640|  16.1k|  return AOM_CODEC_OK;
 1641|  16.1k|}
av1_dx_iface.c:ctrl_ext_tile_debug:
 1658|  16.1k|                                           va_list args) {
 1659|  16.1k|  ctx->ext_tile_debug = va_arg(args, int);
 1660|  16.1k|  return AOM_CODEC_OK;
 1661|  16.1k|}
av1_dx_iface.c:decoder_peek_si:
  358|   292k|                                       aom_codec_stream_info_t *si) {
  359|   292k|  return decoder_peek_si_internal(data, data_sz, si, NULL);
  360|   292k|}
av1_dx_iface.c:decoder_peek_si_internal:
  262|   313k|                                                int *is_intra_only) {
  263|   313k|  int intra_only_flag = 0;
  264|   313k|  int got_sequence_header = 0;
  265|   313k|  int found_keyframe = 0;
  266|       |
  267|   313k|  if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM;
  ------------------
  |  Branch (267:7): [True: 247, False: 313k]
  |  Branch (267:33): [True: 0, False: 313k]
  ------------------
  268|       |
  269|   313k|  si->w = 0;
  270|   313k|  si->h = 0;
  271|   313k|  si->is_kf = 0;  // is_kf indicates whether the current packet contains a RAP
  272|       |
  273|   313k|  ObuHeader obu_header;
  274|   313k|  memset(&obu_header, 0, sizeof(obu_header));
  275|   313k|  size_t payload_size = 0;
  276|   313k|  size_t bytes_read = 0;
  277|   313k|  uint8_t reduced_still_picture_hdr = 0;
  278|   313k|  aom_codec_err_t status = aom_read_obu_header_and_size(
  279|   313k|      data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
  280|   313k|  if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (280:7): [True: 24.1k, False: 288k]
  ------------------
  281|       |
  282|       |  // If the first OBU is a temporal delimiter, skip over it and look at the next
  283|       |  // OBU in the bitstream
  284|   288k|  if (obu_header.type == OBU_TEMPORAL_DELIMITER) {
  ------------------
  |  Branch (284:7): [True: 146k, False: 142k]
  ------------------
  285|       |    // Skip any associated payload (there shouldn't be one, but just in case)
  286|   146k|    if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (286:9): [True: 1.09k, False: 145k]
  ------------------
  287|   145k|    data += bytes_read + payload_size;
  288|   145k|    data_sz -= bytes_read + payload_size;
  289|       |
  290|   145k|    status = aom_read_obu_header_and_size(
  291|   145k|        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
  292|   145k|    if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (292:9): [True: 14.7k, False: 130k]
  ------------------
  293|   145k|  }
  294|   608k|  while (1) {
  ------------------
  |  Branch (294:10): [Folded - Ignored]
  ------------------
  295|   608k|    data += bytes_read;
  296|   608k|    data_sz -= bytes_read;
  297|   608k|    if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (297:9): [True: 4.06k, False: 604k]
  ------------------
  298|       |    // Check that the selected OBU is a sequence header
  299|   604k|    if (obu_header.type == OBU_SEQUENCE_HEADER) {
  ------------------
  |  Branch (299:9): [True: 120k, False: 483k]
  ------------------
  300|       |      // Sanity check on sequence header size
  301|   120k|      if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (301:11): [True: 563, False: 120k]
  ------------------
  302|       |      // Read a few values from the sequence header payload
  303|   120k|      struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
  304|       |
  305|   120k|      av1_read_profile(&rb);  // profile
  306|   120k|      const uint8_t still_picture = aom_rb_read_bit(&rb);
  307|   120k|      reduced_still_picture_hdr = aom_rb_read_bit(&rb);
  308|       |
  309|   120k|      if (!still_picture && reduced_still_picture_hdr) {
  ------------------
  |  Branch (309:11): [True: 83.4k, False: 36.9k]
  |  Branch (309:29): [True: 210, False: 83.2k]
  ------------------
  310|    210|        return AOM_CODEC_UNSUP_BITSTREAM;
  311|    210|      }
  312|       |
  313|   120k|      status = parse_operating_points(&rb, reduced_still_picture_hdr, si);
  314|   120k|      if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (314:11): [True: 559, False: 119k]
  ------------------
  315|       |
  316|   119k|      int num_bits_width = aom_rb_read_literal(&rb, 4) + 1;
  317|   119k|      int num_bits_height = aom_rb_read_literal(&rb, 4) + 1;
  318|   119k|      int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1;
  319|   119k|      int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1;
  320|   119k|      si->w = max_frame_width;
  321|   119k|      si->h = max_frame_height;
  322|   119k|      got_sequence_header = 1;
  323|   483k|    } else if (obu_header.type == OBU_FRAME_HEADER ||
  ------------------
  |  Branch (323:16): [True: 1.25k, False: 482k]
  ------------------
  324|   483k|               obu_header.type == OBU_FRAME) {
  ------------------
  |  Branch (324:16): [True: 304k, False: 178k]
  ------------------
  325|   305k|      if (got_sequence_header && reduced_still_picture_hdr) {
  ------------------
  |  Branch (325:11): [True: 84.8k, False: 220k]
  |  Branch (325:34): [True: 17.1k, False: 67.6k]
  ------------------
  326|  17.1k|        found_keyframe = 1;
  327|  17.1k|        break;
  328|   288k|      } else {
  329|       |        // make sure we have enough bits to get the frame type out
  330|   288k|        if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (330:13): [True: 343, False: 287k]
  ------------------
  331|   287k|        struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
  332|   287k|        const int show_existing_frame = aom_rb_read_bit(&rb);
  333|   287k|        if (!show_existing_frame) {
  ------------------
  |  Branch (333:13): [True: 274k, False: 12.9k]
  ------------------
  334|   274k|          const FRAME_TYPE frame_type = (FRAME_TYPE)aom_rb_read_literal(&rb, 2);
  335|   274k|          if (frame_type == KEY_FRAME) {
  ------------------
  |  Branch (335:15): [True: 84.5k, False: 190k]
  ------------------
  336|  84.5k|            found_keyframe = 1;
  337|  84.5k|            break;  // Stop here as no further OBUs will change the outcome.
  338|   190k|          } else if (frame_type == INTRA_ONLY_FRAME) {
  ------------------
  |  Branch (338:22): [True: 49.4k, False: 140k]
  ------------------
  339|  49.4k|            intra_only_flag = 1;
  340|  49.4k|          }
  341|   274k|        }
  342|   287k|      }
  343|   305k|    }
  344|       |    // skip past any unread OBU header data
  345|   501k|    data += payload_size;
  346|   501k|    data_sz -= payload_size;
  347|   501k|    if (data_sz == 0) break;  // exit if we're out of OBUs
  ------------------
  |  Branch (347:9): [True: 1.93k, False: 499k]
  ------------------
  348|   499k|    status = aom_read_obu_header_and_size(
  349|   499k|        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
  350|   499k|    if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (350:9): [True: 163k, False: 335k]
  ------------------
  351|   499k|  }
  352|   103k|  if (got_sequence_header && found_keyframe) si->is_kf = 1;
  ------------------
  |  Branch (352:7): [True: 83.3k, False: 20.2k]
  |  Branch (352:30): [True: 82.8k, False: 497]
  ------------------
  353|   103k|  if (is_intra_only != NULL) *is_intra_only = intra_only_flag;
  ------------------
  |  Branch (353:7): [True: 15.8k, False: 87.8k]
  ------------------
  354|   103k|  return AOM_CODEC_OK;
  355|   273k|}
av1_dx_iface.c:parse_operating_points:
  206|   120k|                                              aom_codec_stream_info_t *si) {
  207|   120k|  int operating_point_idc0 = 0;
  208|   120k|  if (is_reduced_header) {
  ------------------
  |  Branch (208:7): [True: 21.3k, False: 98.8k]
  ------------------
  209|  21.3k|    aom_rb_read_literal(rb, LEVEL_BITS);  // level
  ------------------
  |  |  464|  21.3k|#define LEVEL_BITS 5
  ------------------
  210|  98.8k|  } else {
  211|  98.8k|    uint8_t decoder_model_info_present_flag = 0;
  212|  98.8k|    int buffer_delay_length_minus_1 = 0;
  213|  98.8k|    aom_codec_err_t status;
  214|  98.8k|    const uint8_t timing_info_present_flag = aom_rb_read_bit(rb);
  215|  98.8k|    if (timing_info_present_flag) {
  ------------------
  |  Branch (215:9): [True: 22.0k, False: 76.8k]
  ------------------
  216|  22.0k|      if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (216:11): [True: 559, False: 21.4k]
  ------------------
  217|  21.4k|      decoder_model_info_present_flag = aom_rb_read_bit(rb);
  218|  21.4k|      if (decoder_model_info_present_flag) {
  ------------------
  |  Branch (218:11): [True: 11.6k, False: 9.79k]
  ------------------
  219|  11.6k|        if ((status = parse_decoder_model_info(
  ------------------
  |  Branch (219:13): [True: 0, False: 11.6k]
  ------------------
  220|  11.6k|                 rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK)
  221|      0|          return status;
  222|  11.6k|      }
  223|  21.4k|    }
  224|  98.3k|    const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb);
  225|  98.3k|    const uint8_t operating_points_cnt_minus_1 =
  226|  98.3k|        aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
  ------------------
  |  |   93|  98.3k|#define OP_POINTS_CNT_MINUS_1_BITS 5
  ------------------
  227|   434k|    for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) {
  ------------------
  |  Branch (227:21): [True: 336k, False: 98.3k]
  ------------------
  228|   336k|      int operating_point_idc;
  229|   336k|      operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
  ------------------
  |  |   94|   336k|#define OP_POINTS_IDC_BITS 12
  ------------------
  230|   336k|      if (i == 0) operating_point_idc0 = operating_point_idc;
  ------------------
  |  Branch (230:11): [True: 98.3k, False: 237k]
  ------------------
  231|   336k|      int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);  // level
  ------------------
  |  |  464|   336k|#define LEVEL_BITS 5
  ------------------
  232|   336k|      if (seq_level_idx > 7) aom_rb_read_bit(rb);               // tier
  ------------------
  |  Branch (232:11): [True: 133k, False: 203k]
  ------------------
  233|   336k|      if (decoder_model_info_present_flag) {
  ------------------
  |  Branch (233:11): [True: 78.0k, False: 258k]
  ------------------
  234|  78.0k|        const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb);
  235|  78.0k|        if (decoder_model_present_for_this_op) {
  ------------------
  |  Branch (235:13): [True: 25.1k, False: 52.9k]
  ------------------
  236|  25.1k|          if ((status = parse_op_parameters_info(
  ------------------
  |  Branch (236:15): [True: 0, False: 25.1k]
  ------------------
  237|  25.1k|                   rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK)
  238|      0|            return status;
  239|  25.1k|        }
  240|  78.0k|      }
  241|   336k|      if (initial_display_delay_present_flag) {
  ------------------
  |  Branch (241:11): [True: 181k, False: 155k]
  ------------------
  242|   181k|        const uint8_t initial_display_delay_present_for_this_op =
  243|   181k|            aom_rb_read_bit(rb);
  244|   181k|        if (initial_display_delay_present_for_this_op)
  ------------------
  |  Branch (244:13): [True: 70.9k, False: 110k]
  ------------------
  245|  70.9k|          aom_rb_read_literal(rb, 4);  // initial_display_delay_minus_1
  246|   181k|      }
  247|   336k|    }
  248|  98.3k|  }
  249|       |
  250|   119k|  if (aom_get_num_layers_from_operating_point_idc(
  ------------------
  |  Branch (250:7): [True: 0, False: 119k]
  ------------------
  251|   119k|          operating_point_idc0, &si->number_spatial_layers,
  252|   119k|          &si->number_temporal_layers) != AOM_CODEC_OK) {
  253|      0|    return AOM_CODEC_ERROR;
  254|      0|  }
  255|       |
  256|   119k|  return AOM_CODEC_OK;
  257|   119k|}
av1_dx_iface.c:parse_timing_info:
  158|  22.0k|static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) {
  159|  22.0k|  const uint32_t num_units_in_display_tick =
  160|  22.0k|      aom_rb_read_unsigned_literal(rb, 32);
  161|  22.0k|  const uint32_t time_scale = aom_rb_read_unsigned_literal(rb, 32);
  162|  22.0k|  if (num_units_in_display_tick == 0 || time_scale == 0)
  ------------------
  |  Branch (162:7): [True: 10, False: 22.0k]
  |  Branch (162:41): [True: 448, False: 21.5k]
  ------------------
  163|    458|    return AOM_CODEC_UNSUP_BITSTREAM;
  164|  21.5k|  const uint8_t equal_picture_interval = aom_rb_read_bit(rb);
  165|  21.5k|  if (equal_picture_interval) {
  ------------------
  |  Branch (165:7): [True: 13.5k, False: 8.01k]
  ------------------
  166|  13.5k|    const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
  167|  13.5k|    if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
  ------------------
  |  Branch (167:9): [True: 101, False: 13.4k]
  ------------------
  168|       |      // num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.
  169|    101|      return AOM_CODEC_UNSUP_BITSTREAM;
  170|    101|    }
  171|  13.5k|  }
  172|  21.4k|  return AOM_CODEC_OK;
  173|  21.5k|}
av1_dx_iface.c:parse_decoder_model_info:
  176|  11.6k|    struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) {
  177|  11.6k|  *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5);
  178|  11.6k|  const uint32_t num_units_in_decoding_tick =
  179|  11.6k|      aom_rb_read_unsigned_literal(rb, 32);
  180|  11.6k|  const uint8_t buffer_removal_time_length_minus_1 = aom_rb_read_literal(rb, 5);
  181|  11.6k|  const uint8_t frame_presentation_time_length_minus_1 =
  182|  11.6k|      aom_rb_read_literal(rb, 5);
  183|  11.6k|  (void)num_units_in_decoding_tick;
  184|  11.6k|  (void)buffer_removal_time_length_minus_1;
  185|  11.6k|  (void)frame_presentation_time_length_minus_1;
  186|  11.6k|  return AOM_CODEC_OK;
  187|  11.6k|}
av1_dx_iface.c:parse_op_parameters_info:
  190|  25.1k|    struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) {
  191|  25.1k|  const int n = buffer_delay_length_minus_1 + 1;
  192|  25.1k|  const uint32_t decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
  193|  25.1k|  const uint32_t encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
  194|  25.1k|  const uint8_t low_delay_mode_flag = aom_rb_read_bit(rb);
  195|  25.1k|  (void)decoder_buffer_delay;
  196|  25.1k|  (void)encoder_buffer_delay;
  197|  25.1k|  (void)low_delay_mode_flag;
  198|  25.1k|  return AOM_CODEC_OK;
  199|  25.1k|}
av1_dx_iface.c:decoder_decode:
  677|   292k|                                      void *user_priv) {
  678|   292k|  aom_codec_err_t res = AOM_CODEC_OK;
  679|       |
  680|       |#if CONFIG_INSPECTION
  681|       |  if (user_priv != 0) {
  682|       |    return decoder_inspect(ctx, data, data_sz, user_priv);
  683|       |  }
  684|       |#endif
  685|       |
  686|   292k|  release_pending_output_frames(ctx);
  687|       |
  688|       |  /* Sanity checks */
  689|       |  /* NULL data ptr allowed if data_sz is 0 too */
  690|   292k|  if (data == NULL && data_sz == 0) {
  ------------------
  |  Branch (690:7): [True: 0, False: 292k]
  |  Branch (690:23): [True: 0, False: 0]
  ------------------
  691|      0|    ctx->flushed = 1;
  692|      0|    return AOM_CODEC_OK;
  693|      0|  }
  694|   292k|  if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
  ------------------
  |  Branch (694:7): [True: 0, False: 292k]
  |  Branch (694:23): [True: 14.4k, False: 277k]
  ------------------
  695|       |
  696|       |  // Reset flushed when receiving a valid frame.
  697|   277k|  ctx->flushed = 0;
  698|       |
  699|       |  // Initialize the decoder worker on the first frame.
  700|   277k|  if (ctx->frame_worker == NULL) {
  ------------------
  |  Branch (700:7): [True: 16.1k, False: 261k]
  ------------------
  701|  16.1k|    res = init_decoder(ctx);
  702|  16.1k|    if (res != AOM_CODEC_OK) return res;
  ------------------
  |  Branch (702:9): [True: 0, False: 16.1k]
  ------------------
  703|  16.1k|  }
  704|       |
  705|   277k|  const uint8_t *data_start = data;
  706|   277k|  const uint8_t *data_end = data + data_sz;
  707|       |
  708|   277k|  if (ctx->is_annexb) {
  ------------------
  |  Branch (708:7): [True: 3.90k, False: 273k]
  ------------------
  709|       |    // read the size of this temporal unit
  710|  3.90k|    size_t length_of_size;
  711|  3.90k|    uint64_t temporal_unit_size;
  712|  3.90k|    if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size,
  ------------------
  |  Branch (712:9): [True: 172, False: 3.73k]
  ------------------
  713|  3.90k|                        &length_of_size) != 0) {
  714|    172|      return AOM_CODEC_CORRUPT_FRAME;
  715|    172|    }
  716|  3.73k|    data_start += length_of_size;
  717|  3.73k|    if (temporal_unit_size > (size_t)(data_end - data_start))
  ------------------
  |  Branch (717:9): [True: 750, False: 2.98k]
  ------------------
  718|    750|      return AOM_CODEC_CORRUPT_FRAME;
  719|  2.98k|    data_end = data_start + temporal_unit_size;
  720|  2.98k|  }
  721|       |
  722|       |  // Decode in serial mode.
  723|   382k|  while (data_start < data_end) {
  ------------------
  |  Branch (723:10): [True: 319k, False: 63.6k]
  ------------------
  724|   319k|    uint64_t frame_size;
  725|   319k|    if (ctx->is_annexb) {
  ------------------
  |  Branch (725:9): [True: 3.14k, False: 315k]
  ------------------
  726|       |      // read the size of this frame unit
  727|  3.14k|      size_t length_of_size;
  728|  3.14k|      if (aom_uleb_decode(data_start, (size_t)(data_end - data_start),
  ------------------
  |  Branch (728:11): [True: 17, False: 3.12k]
  ------------------
  729|  3.14k|                          &frame_size, &length_of_size) != 0) {
  730|     17|        return AOM_CODEC_CORRUPT_FRAME;
  731|     17|      }
  732|  3.12k|      data_start += length_of_size;
  733|  3.12k|      if (frame_size > (size_t)(data_end - data_start))
  ------------------
  |  Branch (733:11): [True: 921, False: 2.20k]
  ------------------
  734|    921|        return AOM_CODEC_CORRUPT_FRAME;
  735|   315k|    } else {
  736|   315k|      frame_size = (uint64_t)(data_end - data_start);
  737|   315k|    }
  738|       |
  739|   318k|    res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv);
  740|   318k|    if (res != AOM_CODEC_OK) return res;
  ------------------
  |  Branch (740:9): [True: 212k, False: 105k]
  ------------------
  741|       |
  742|       |    // Allow extra zero bytes after the frame end
  743|   176k|    while (data_start < data_end) {
  ------------------
  |  Branch (743:12): [True: 113k, False: 63.4k]
  ------------------
  744|   113k|      const uint8_t marker = data_start[0];
  745|   113k|      if (marker) break;
  ------------------
  |  Branch (745:11): [True: 42.5k, False: 70.6k]
  ------------------
  746|  70.6k|      ++data_start;
  747|  70.6k|    }
  748|   105k|  }
  749|       |
  750|  63.6k|  return res;
  751|   276k|}
av1_dx_iface.c:release_pending_output_frames:
  565|   292k|static void release_pending_output_frames(aom_codec_alg_priv_t *ctx) {
  566|       |  // Release any pending output frames from the previous decoder_decode or
  567|       |  // decoder_inspect call. We need to do this even if the decoder is being
  568|       |  // flushed or the input arguments are invalid.
  569|   292k|  if (ctx->frame_worker) {
  ------------------
  |  Branch (569:7): [True: 272k, False: 19.1k]
  ------------------
  570|   272k|    BufferPool *const pool = ctx->buffer_pool;
  571|   272k|    lock_buffer_pool(pool);
  572|   272k|    AVxWorker *const worker = ctx->frame_worker;
  573|   272k|    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  574|   272k|    struct AV1Decoder *pbi = frame_worker_data->pbi;
  575|   335k|    for (size_t j = 0; j < pbi->num_output_frames; j++) {
  ------------------
  |  Branch (575:24): [True: 62.6k, False: 272k]
  ------------------
  576|  62.6k|      decrease_ref_count(pbi->output_frames[j], pool);
  577|  62.6k|    }
  578|   272k|    pbi->num_output_frames = 0;
  579|   272k|    unlock_buffer_pool(pool);
  580|   287k|    for (size_t j = 0; j < ctx->num_grain_image_frame_buffers; j++) {
  ------------------
  |  Branch (580:24): [True: 14.2k, False: 272k]
  ------------------
  581|  14.2k|      pool->release_fb_cb(pool->cb_priv, &ctx->grain_image_frame_buffers[j]);
  582|  14.2k|      ctx->grain_image_frame_buffers[j].data = NULL;
  583|  14.2k|      ctx->grain_image_frame_buffers[j].size = 0;
  584|  14.2k|      ctx->grain_image_frame_buffers[j].priv = NULL;
  585|  14.2k|    }
  586|   272k|    ctx->num_grain_image_frame_buffers = 0;
  587|   272k|  }
  588|   292k|}
av1_dx_iface.c:init_decoder:
  426|  16.1k|static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
  427|  16.1k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  428|       |
  429|  16.1k|  ctx->last_show_frame = NULL;
  430|  16.1k|  ctx->need_resync = 1;
  431|  16.1k|  ctx->flushed = 0;
  432|       |
  433|  16.1k|  ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
  434|  16.1k|  if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
  ------------------
  |  Branch (434:7): [True: 0, False: 16.1k]
  ------------------
  435|  16.1k|  ctx->buffer_pool->num_frame_bufs = FRAME_BUFFERS;
  ------------------
  |  |  561|  16.1k|#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME)
  ------------------
  436|  16.1k|  ctx->buffer_pool->frame_bufs = (RefCntBuffer *)aom_calloc(
  437|  16.1k|      ctx->buffer_pool->num_frame_bufs, sizeof(*ctx->buffer_pool->frame_bufs));
  438|  16.1k|  if (ctx->buffer_pool->frame_bufs == NULL) {
  ------------------
  |  Branch (438:7): [True: 0, False: 16.1k]
  ------------------
  439|      0|    ctx->buffer_pool->num_frame_bufs = 0;
  440|      0|    aom_free(ctx->buffer_pool);
  441|      0|    ctx->buffer_pool = NULL;
  442|      0|    return AOM_CODEC_MEM_ERROR;
  443|      0|  }
  444|       |
  445|  16.1k|#if CONFIG_MULTITHREAD
  446|  16.1k|  if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
  ------------------
  |  Branch (446:7): [True: 0, False: 16.1k]
  ------------------
  447|      0|    aom_free(ctx->buffer_pool->frame_bufs);
  448|      0|    ctx->buffer_pool->frame_bufs = NULL;
  449|      0|    ctx->buffer_pool->num_frame_bufs = 0;
  450|      0|    aom_free(ctx->buffer_pool);
  451|      0|    ctx->buffer_pool = NULL;
  452|      0|    set_error_detail(ctx, "Failed to allocate buffer pool mutex");
  453|      0|    return AOM_CODEC_MEM_ERROR;
  454|      0|  }
  455|  16.1k|#endif
  456|       |
  457|  16.1k|  ctx->frame_worker = (AVxWorker *)aom_malloc(sizeof(*ctx->frame_worker));
  458|  16.1k|  if (ctx->frame_worker == NULL) {
  ------------------
  |  Branch (458:7): [True: 0, False: 16.1k]
  ------------------
  459|      0|    set_error_detail(ctx, "Failed to allocate frame_worker");
  460|      0|    return AOM_CODEC_MEM_ERROR;
  461|      0|  }
  462|       |
  463|  16.1k|  AVxWorker *const worker = ctx->frame_worker;
  464|  16.1k|  winterface->init(worker);
  465|  16.1k|  worker->thread_name = "aom frameworker";
  466|  16.1k|  worker->data1 = aom_memalign(32, sizeof(FrameWorkerData));
  467|  16.1k|  if (worker->data1 == NULL) {
  ------------------
  |  Branch (467:7): [True: 0, False: 16.1k]
  ------------------
  468|      0|    winterface->end(worker);
  469|      0|    aom_free(worker);
  470|      0|    ctx->frame_worker = NULL;
  471|      0|    set_error_detail(ctx, "Failed to allocate frame_worker_data");
  472|      0|    return AOM_CODEC_MEM_ERROR;
  473|      0|  }
  474|  16.1k|  FrameWorkerData *frame_worker_data = (FrameWorkerData *)worker->data1;
  475|  16.1k|  frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool);
  476|  16.1k|  if (frame_worker_data->pbi == NULL) {
  ------------------
  |  Branch (476:7): [True: 0, False: 16.1k]
  ------------------
  477|      0|    winterface->end(worker);
  478|      0|    aom_free(frame_worker_data);
  479|      0|    aom_free(worker);
  480|      0|    ctx->frame_worker = NULL;
  481|      0|    set_error_detail(ctx, "Failed to allocate frame_worker_data->pbi");
  482|      0|    return AOM_CODEC_MEM_ERROR;
  483|      0|  }
  484|  16.1k|  frame_worker_data->frame_context_ready = 0;
  485|  16.1k|  frame_worker_data->received_frame = 0;
  486|  16.1k|  frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth;
  487|       |
  488|       |  // If decoding in serial mode, FrameWorker thread could create tile worker
  489|       |  // thread or loopfilter thread.
  490|  16.1k|  frame_worker_data->pbi->max_threads = ctx->cfg.threads;
  491|  16.1k|  frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
  492|  16.1k|  frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode;
  493|  16.1k|  frame_worker_data->pbi->is_annexb = ctx->is_annexb;
  494|  16.1k|  frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
  495|  16.1k|  frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
  496|  16.1k|  frame_worker_data->pbi->operating_point = ctx->operating_point;
  497|  16.1k|  frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
  498|  16.1k|  frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
  499|  16.1k|  frame_worker_data->pbi->row_mt = ctx->row_mt;
  500|  16.1k|  frame_worker_data->pbi->is_fwd_kf_present = 0;
  501|  16.1k|  frame_worker_data->pbi->is_arf_frame_present = 0;
  502|  16.1k|  worker->hook = frame_worker_hook;
  503|       |
  504|  16.1k|  init_buffer_callbacks(ctx);
  505|       |
  506|  16.1k|  return AOM_CODEC_OK;
  507|  16.1k|}
av1_dx_iface.c:set_error_detail:
  370|   205k|                             const char *const error) {
  371|   205k|  ctx->base.err_detail = error;
  372|   205k|}
av1_dx_iface.c:frame_worker_hook:
  410|   311k|static int frame_worker_hook(void *arg1, void *arg2) {
  411|   311k|  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
  412|   311k|  const uint8_t *data = frame_worker_data->data;
  413|   311k|  (void)arg2;
  414|       |
  415|   311k|  int result = av1_receive_compressed_data(frame_worker_data->pbi,
  416|   311k|                                           frame_worker_data->data_size, &data);
  417|   311k|  frame_worker_data->data_end = data;
  418|       |
  419|   311k|  if (result != 0) {
  ------------------
  |  Branch (419:7): [True: 205k, False: 105k]
  ------------------
  420|       |    // Check decode result in serial decode.
  421|   205k|    frame_worker_data->pbi->need_resync = 1;
  422|   205k|  }
  423|   311k|  return !result;
  424|   311k|}
av1_dx_iface.c:init_buffer_callbacks:
  382|  16.1k|static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
  383|  16.1k|  AVxWorker *const worker = ctx->frame_worker;
  384|  16.1k|  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  385|  16.1k|  AV1Decoder *const pbi = frame_worker_data->pbi;
  386|  16.1k|  AV1_COMMON *const cm = &pbi->common;
  387|  16.1k|  BufferPool *const pool = cm->buffer_pool;
  388|       |
  389|  16.1k|  cm->cur_frame = NULL;
  390|  16.1k|  cm->features.byte_alignment = ctx->byte_alignment;
  391|  16.1k|  pbi->skip_loop_filter = ctx->skip_loop_filter;
  392|  16.1k|  pbi->skip_film_grain = ctx->skip_film_grain;
  393|       |
  394|  16.1k|  if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
  ------------------
  |  Branch (394:7): [True: 0, False: 16.1k]
  |  Branch (394:37): [True: 0, False: 0]
  ------------------
  395|      0|    pool->get_fb_cb = ctx->get_ext_fb_cb;
  396|      0|    pool->release_fb_cb = ctx->release_ext_fb_cb;
  397|      0|    pool->cb_priv = ctx->ext_priv;
  398|  16.1k|  } else {
  399|  16.1k|    pool->get_fb_cb = av1_get_frame_buffer;
  400|  16.1k|    pool->release_fb_cb = av1_release_frame_buffer;
  401|       |
  402|  16.1k|    if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers))
  ------------------
  |  Branch (402:9): [True: 0, False: 16.1k]
  ------------------
  403|      0|      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
  404|      0|                         "Failed to initialize internal frame buffers");
  405|       |
  406|  16.1k|    pool->cb_priv = &pool->int_frame_buffers;
  407|  16.1k|  }
  408|  16.1k|}
av1_dx_iface.c:decode_one:
  519|   318k|                                  void *user_priv) {
  520|   318k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  521|       |
  522|       |  // Determine the stream parameters. Note that we rely on peek_si to
  523|       |  // validate that we have a buffer that does not wrap around the top
  524|       |  // of the heap.
  525|   318k|  if (!ctx->si.h) {
  ------------------
  |  Branch (525:7): [True: 21.2k, False: 296k]
  ------------------
  526|  21.2k|    int is_intra_only = 0;
  527|  21.2k|    ctx->si.is_annexb = ctx->is_annexb;
  528|  21.2k|    const aom_codec_err_t res =
  529|  21.2k|        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only);
  530|  21.2k|    if (res != AOM_CODEC_OK) return res;
  ------------------
  |  Branch (530:9): [True: 5.43k, False: 15.8k]
  ------------------
  531|       |
  532|  15.8k|    if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR;
  ------------------
  |  Branch (532:9): [True: 1.58k, False: 14.2k]
  |  Branch (532:27): [True: 946, False: 634]
  ------------------
  533|  15.8k|  }
  534|       |
  535|   311k|  AVxWorker *const worker = ctx->frame_worker;
  536|   311k|  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  537|   311k|  frame_worker_data->data = *data;
  538|   311k|  frame_worker_data->data_size = data_sz;
  539|   311k|  frame_worker_data->user_priv = user_priv;
  540|   311k|  frame_worker_data->received_frame = 1;
  541|       |
  542|   311k|  frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode;
  543|   311k|  frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
  544|   311k|  frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
  545|   311k|  frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
  546|   311k|  frame_worker_data->pbi->row_mt = ctx->row_mt;
  547|   311k|  frame_worker_data->pbi->ext_refs = ctx->ext_refs;
  548|       |
  549|   311k|  frame_worker_data->pbi->is_annexb = ctx->is_annexb;
  550|       |
  551|   311k|  worker->had_error = 0;
  552|   311k|  winterface->execute(worker);
  553|       |
  554|       |  // Update data pointer after decode.
  555|   311k|  *data = frame_worker_data->data_end;
  556|       |
  557|   311k|  if (worker->had_error)
  ------------------
  |  Branch (557:7): [True: 205k, False: 105k]
  ------------------
  558|   205k|    return update_error_state(ctx, &frame_worker_data->pbi->error);
  559|       |
  560|   105k|  check_resync(ctx, frame_worker_data->pbi);
  561|       |
  562|   105k|  return AOM_CODEC_OK;
  563|   311k|}
av1_dx_iface.c:update_error_state:
  375|   205k|    aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
  376|   205k|  if (error->error_code)
  ------------------
  |  Branch (376:7): [True: 205k, False: 0]
  ------------------
  377|   205k|    set_error_detail(ctx, error->has_detail ? error->detail : NULL);
  ------------------
  |  Branch (377:27): [True: 145k, False: 60.6k]
  ------------------
  378|       |
  379|   205k|  return error->error_code;
  380|   205k|}
av1_dx_iface.c:check_resync:
  510|   169k|                                const AV1Decoder *const pbi) {
  511|       |  // Clear resync flag if worker got a key frame or intra only frame.
  512|   169k|  if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
  ------------------
  |  Branch (512:7): [True: 41.7k, False: 127k]
  |  Branch (512:32): [True: 40.4k, False: 1.30k]
  ------------------
  513|   169k|      frame_is_intra_only(&pbi->common))
  ------------------
  |  Branch (513:7): [True: 40.4k, False: 0]
  ------------------
  514|  40.4k|    ctx->need_resync = 0;
  515|   169k|}
av1_dx_iface.c:decoder_get_frame:
  809|   354k|                                      aom_codec_iter_t *iter) {
  810|   354k|  aom_image_t *img = NULL;
  811|       |
  812|   354k|  if (!iter) {
  ------------------
  |  Branch (812:7): [True: 0, False: 354k]
  ------------------
  813|      0|    return NULL;
  814|      0|  }
  815|       |
  816|       |  // To avoid having to allocate any extra storage, treat 'iter' as
  817|       |  // simply a pointer to an integer index
  818|   354k|  uintptr_t *index = (uintptr_t *)iter;
  819|       |
  820|   354k|  if (ctx->frame_worker == NULL) {
  ------------------
  |  Branch (820:7): [True: 3.04k, False: 351k]
  ------------------
  821|  3.04k|    return NULL;
  822|  3.04k|  }
  823|   351k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  824|   351k|  AVxWorker *const worker = ctx->frame_worker;
  825|   351k|  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  826|   351k|  AV1Decoder *const pbi = frame_worker_data->pbi;
  827|   351k|  pbi->error.error_code = AOM_CODEC_OK;
  828|   351k|  pbi->error.has_detail = 0;
  829|   351k|  AV1_COMMON *const cm = &pbi->common;
  830|   351k|  CommonTileParams *const tiles = &cm->tiles;
  831|       |  // Wait for the frame from worker thread.
  832|   351k|  if (!winterface->sync(worker)) {
  ------------------
  |  Branch (832:7): [True: 215k, False: 135k]
  ------------------
  833|       |    // Decoding failed. Release the worker thread.
  834|   215k|    frame_worker_data->received_frame = 0;
  835|   215k|    ctx->need_resync = 1;
  836|       |    // TODO(aomedia:3519): Set an error code. Check if a different error code
  837|       |    // should be used if ctx->flushed != 1.
  838|   215k|    return NULL;
  839|   215k|  }
  840|       |  // Check if worker has received any frames.
  841|   135k|  if (frame_worker_data->received_frame == 1) {
  ------------------
  |  Branch (841:7): [True: 63.7k, False: 71.9k]
  ------------------
  842|  63.7k|    frame_worker_data->received_frame = 0;
  843|  63.7k|    check_resync(ctx, frame_worker_data->pbi);
  844|  63.7k|  }
  845|   135k|  YV12_BUFFER_CONFIG *sd;
  846|   135k|  aom_film_grain_t *grain_params;
  847|   135k|  if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, &grain_params) !=
  ------------------
  |  Branch (847:7): [True: 73.3k, False: 62.4k]
  ------------------
  848|   135k|      0) {
  849|  73.3k|    return NULL;
  850|  73.3k|  }
  851|  62.4k|  RefCntBuffer *const output_frame_buf = pbi->output_frames[*index];
  852|  62.4k|  ctx->last_show_frame = output_frame_buf;
  853|  62.4k|  if (ctx->need_resync) return NULL;
  ------------------
  |  Branch (853:7): [True: 31, False: 62.4k]
  ------------------
  854|  62.4k|  aom_img_remove_metadata(&ctx->img);
  855|  62.4k|  yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
  856|  62.4k|  move_decoder_metadata_to_img(pbi, &ctx->img);
  857|       |
  858|  62.4k|  if (!pbi->ext_tile_debug && tiles->large_scale) {
  ------------------
  |  Branch (858:7): [True: 51.1k, False: 11.2k]
  |  Branch (858:31): [True: 13.6k, False: 37.4k]
  ------------------
  859|  13.6k|    *index += 1;  // Advance the iterator to point to the next image
  860|  13.6k|    aom_img_remove_metadata(&ctx->img);
  861|  13.6k|    yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL);
  862|  13.6k|    move_decoder_metadata_to_img(pbi, &ctx->img);
  863|  13.6k|    img = &ctx->img;
  864|  13.6k|    return img;
  865|  13.6k|  }
  866|       |
  867|  48.7k|  const int num_planes = av1_num_planes(cm);
  868|  48.7k|  if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
  ------------------
  |  Branch (868:7): [True: 11.2k, False: 37.4k]
  |  Branch (868:30): [True: 1.29k, False: 9.93k]
  ------------------
  869|  48.7k|      pbi->dec_tile_row >= 0) {
  ------------------
  |  Branch (869:7): [True: 0, False: 1.29k]
  ------------------
  870|      0|    int tile_width, tile_height;
  871|      0|    if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) {
  ------------------
  |  Branch (871:9): [True: 0, False: 0]
  ------------------
  872|      0|      return NULL;
  873|      0|    }
  874|      0|    const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  875|      0|    const int mi_row = tile_row * tile_height;
  876|      0|    const int ssy = ctx->img.y_chroma_shift;
  877|      0|    int plane;
  878|      0|    ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  879|      0|    if (num_planes > 1) {
  ------------------
  |  Branch (879:9): [True: 0, False: 0]
  ------------------
  880|      0|      for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
  ------------------
  |  |   36|      0|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (880:23): [True: 0, False: 0]
  ------------------
  881|      0|        ctx->img.planes[plane] +=
  882|      0|            mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  883|      0|      }
  884|      0|    }
  885|      0|    ctx->img.d_h =
  886|      0|        AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE;
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                      AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE;
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  887|      0|  }
  888|       |
  889|  48.7k|  if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
  ------------------
  |  Branch (889:7): [True: 11.2k, False: 37.4k]
  |  Branch (889:30): [True: 1.29k, False: 9.93k]
  ------------------
  890|  48.7k|      pbi->dec_tile_col >= 0) {
  ------------------
  |  Branch (890:7): [True: 0, False: 1.29k]
  ------------------
  891|      0|    int tile_width, tile_height;
  892|      0|    if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) {
  ------------------
  |  Branch (892:9): [True: 0, False: 0]
  ------------------
  893|      0|      return NULL;
  894|      0|    }
  895|      0|    const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  896|      0|    const int mi_col = tile_col * tile_width;
  897|      0|    const int ssx = ctx->img.x_chroma_shift;
  898|      0|    const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
  ------------------
  |  |   38|      0|#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
  ------------------
  |  Branch (898:24): [True: 0, False: 0]
  ------------------
  899|      0|    int plane;
  900|      0|    ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  901|      0|    if (num_planes > 1) {
  ------------------
  |  Branch (901:9): [True: 0, False: 0]
  ------------------
  902|      0|      for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
  ------------------
  |  |   36|      0|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (902:23): [True: 0, False: 0]
  ------------------
  903|      0|        ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  904|      0|      }
  905|      0|    }
  906|      0|    ctx->img.d_w = AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE;
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
                  ctx->img.d_w = AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE;
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  907|      0|  }
  908|       |
  909|  48.7k|  ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
  910|  48.7k|  img = &ctx->img;
  911|  48.7k|  img->temporal_id = output_frame_buf->temporal_id;
  912|  48.7k|  img->spatial_id = output_frame_buf->spatial_id;
  913|  48.7k|  if (pbi->skip_film_grain) grain_params->apply_grain = 0;
  ------------------
  |  Branch (913:7): [True: 0, False: 48.7k]
  ------------------
  914|  48.7k|  aom_image_t *res =
  915|  48.7k|      add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
  916|  48.7k|  if (!res) {
  ------------------
  |  Branch (916:7): [True: 0, False: 48.7k]
  ------------------
  917|      0|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  918|      0|    pbi->error.has_detail = 1;
  919|      0|    snprintf(pbi->error.detail, sizeof(pbi->error.detail),
  920|      0|             "Grain synthesis failed\n");
  921|      0|    return res;
  922|      0|  }
  923|  48.7k|  *index += 1;  // Advance the iterator to point to the next image
  924|  48.7k|  return res;
  925|  48.7k|}
av1_dx_iface.c:move_decoder_metadata_to_img:
  800|  76.1k|static void move_decoder_metadata_to_img(AV1Decoder *pbi, aom_image_t *img) {
  801|  76.1k|  if (pbi->metadata && img) {
  ------------------
  |  Branch (801:7): [True: 44, False: 76.0k]
  |  Branch (801:24): [True: 44, False: 0]
  ------------------
  802|     44|    assert(!img->metadata);
  803|     44|    img->metadata = pbi->metadata;
  804|     44|    pbi->metadata = NULL;
  805|     44|  }
  806|  76.1k|}
av1_dx_iface.c:add_grain_if_needed:
  771|  48.7k|                                        aom_film_grain_t *grain_params) {
  772|  48.7k|  if (!grain_params->apply_grain) return img;
  ------------------
  |  Branch (772:7): [True: 34.0k, False: 14.6k]
  ------------------
  773|       |
  774|  14.6k|  const int w_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_w, 1);
  ------------------
  |  |   71|  14.6k|  (((value) + ((1u << (n)) - 1)) & ~((1u << (n)) - 1))
  ------------------
  775|  14.6k|  const int h_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_h, 1);
  ------------------
  |  |   71|  14.6k|  (((value) + ((1u << (n)) - 1)) & ~((1u << (n)) - 1))
  ------------------
  776|       |
  777|  14.6k|  BufferPool *const pool = ctx->buffer_pool;
  778|  14.6k|  aom_codec_frame_buffer_t *fb =
  779|  14.6k|      &ctx->grain_image_frame_buffers[ctx->num_grain_image_frame_buffers];
  780|  14.6k|  AllocCbParam param;
  781|  14.6k|  param.pool = pool;
  782|  14.6k|  param.fb = fb;
  783|  14.6k|  if (!aom_img_alloc_with_cb(grain_img, img->fmt, w_even, h_even, 16,
  ------------------
  |  Branch (783:7): [True: 0, False: 14.6k]
  ------------------
  784|  14.6k|                             AllocWithGetFrameBufferCb, &param)) {
  785|      0|    return NULL;
  786|      0|  }
  787|       |
  788|  14.6k|  grain_img->user_priv = img->user_priv;
  789|  14.6k|  grain_img->fb_priv = fb->priv;
  790|  14.6k|  if (av1_add_film_grain(grain_params, img, grain_img)) {
  ------------------
  |  Branch (790:7): [True: 0, False: 14.6k]
  ------------------
  791|      0|    pool->release_fb_cb(pool->cb_priv, fb);
  792|      0|    return NULL;
  793|      0|  }
  794|       |
  795|  14.6k|  ctx->num_grain_image_frame_buffers++;
  796|  14.6k|  return grain_img;
  797|  14.6k|}
av1_dx_iface.c:AllocWithGetFrameBufferCb:
  758|  14.6k|static void *AllocWithGetFrameBufferCb(void *priv, size_t size) {
  759|  14.6k|  AllocCbParam *param = (AllocCbParam *)priv;
  760|  14.6k|  if (param->pool->get_fb_cb(param->pool->cb_priv, size, param->fb) < 0)
  ------------------
  |  Branch (760:7): [True: 0, False: 14.6k]
  ------------------
  761|      0|    return NULL;
  762|  14.6k|  if (param->fb->data == NULL || param->fb->size < size) return NULL;
  ------------------
  |  Branch (762:7): [True: 0, False: 14.6k]
  |  Branch (762:34): [True: 0, False: 14.6k]
  ------------------
  763|  14.6k|  return param->fb->data;
  764|  14.6k|}

av1_dx_iface.c:yuvconfig2image:
   23|  76.1k|                                   void *user_priv) {
   24|       |  /* aom_img_wrap() doesn't allow specifying independent strides for
   25|       |   * the Y, U, and V planes, nor other alignment adjustments that
   26|       |   * might be representable by a YV12_BUFFER_CONFIG, so we just
   27|       |   * initialize all the fields.
   28|       |   */
   29|  76.1k|  int bps;
   30|  76.1k|  if (!yv12->subsampling_y) {
  ------------------
  |  Branch (30:7): [True: 21.8k, False: 54.2k]
  ------------------
   31|  21.8k|    if (!yv12->subsampling_x) {
  ------------------
  |  Branch (31:9): [True: 19.4k, False: 2.38k]
  ------------------
   32|  19.4k|      img->fmt = AOM_IMG_FMT_I444;
   33|  19.4k|      bps = 24;
   34|  19.4k|    } else {
   35|  2.38k|      img->fmt = AOM_IMG_FMT_I422;
   36|  2.38k|      bps = 16;
   37|  2.38k|    }
   38|  54.2k|  } else {
   39|  54.2k|    img->fmt = AOM_IMG_FMT_I420;
   40|  54.2k|    bps = 12;
   41|  54.2k|  }
   42|  76.1k|  img->cp = yv12->color_primaries;
   43|  76.1k|  img->tc = yv12->transfer_characteristics;
   44|  76.1k|  img->mc = yv12->matrix_coefficients;
   45|  76.1k|  img->monochrome = yv12->monochrome;
   46|  76.1k|  img->csp = yv12->chroma_sample_position;
   47|  76.1k|  img->range = yv12->color_range;
   48|  76.1k|  img->bit_depth = 8;
   49|  76.1k|  img->w = yv12->y_width;
   50|  76.1k|  img->h = yv12->y_height;
   51|  76.1k|  img->d_w = yv12->y_crop_width;
   52|  76.1k|  img->d_h = yv12->y_crop_height;
   53|  76.1k|  img->r_w = yv12->render_width;
   54|  76.1k|  img->r_h = yv12->render_height;
   55|  76.1k|  img->x_chroma_shift = yv12->subsampling_x;
   56|  76.1k|  img->y_chroma_shift = yv12->subsampling_y;
   57|  76.1k|  img->planes[AOM_PLANE_Y] = yv12->y_buffer;
  ------------------
  |  |  226|  76.1k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
   58|  76.1k|  img->planes[AOM_PLANE_U] = yv12->u_buffer;
  ------------------
  |  |  227|  76.1k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   59|  76.1k|  img->planes[AOM_PLANE_V] = yv12->v_buffer;
  ------------------
  |  |  228|  76.1k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
   60|  76.1k|  img->stride[AOM_PLANE_Y] = yv12->y_stride;
  ------------------
  |  |  226|  76.1k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
   61|  76.1k|  img->stride[AOM_PLANE_U] = yv12->uv_stride;
  ------------------
  |  |  227|  76.1k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   62|  76.1k|  img->stride[AOM_PLANE_V] = yv12->uv_stride;
  ------------------
  |  |  228|  76.1k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
   63|  76.1k|  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
  ------------------
  |  |  142|  76.1k|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (63:7): [True: 17.4k, False: 58.6k]
  ------------------
   64|  17.4k|    bps *= 2;
   65|       |    // aom_image_t uses byte strides and a pointer to the first byte
   66|       |    // of the image.
   67|  17.4k|    img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
  ------------------
  |  |   38|  17.4k|#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
  ------------------
   68|  17.4k|    img->bit_depth = yv12->bit_depth;
   69|  17.4k|    img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
  ------------------
  |  |  226|  17.4k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
                  img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
  ------------------
  |  |   75|  17.4k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   70|  17.4k|    img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
  ------------------
  |  |  227|  17.4k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
                  img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
  ------------------
  |  |   75|  17.4k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   71|  17.4k|    img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
  ------------------
  |  |  228|  17.4k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
                  img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
  ------------------
  |  |   75|  17.4k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   72|  17.4k|    img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;
  ------------------
  |  |  226|  17.4k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
   73|  17.4k|    img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
  ------------------
  |  |  227|  17.4k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   74|  17.4k|    img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
  ------------------
  |  |  228|  17.4k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
   75|  17.4k|  }
   76|  76.1k|  img->bps = bps;
   77|  76.1k|  img->user_priv = user_priv;
   78|  76.1k|  img->img_data = yv12->buffer_alloc;
   79|  76.1k|  img->img_data_owner = 0;
   80|  76.1k|  img->self_allocd = 0;
   81|  76.1k|  img->sz = yv12->frame_size;
   82|  76.1k|  assert(!yv12->metadata);
   83|  76.1k|  img->metadata = NULL;
   84|  76.1k|}

av1_free_ref_frame_buffers:
   40|  16.1k|void av1_free_ref_frame_buffers(BufferPool *pool) {
   41|  16.1k|  int i;
   42|       |
   43|   273k|  for (i = 0; i < pool->num_frame_bufs; ++i) {
  ------------------
  |  Branch (43:15): [True: 257k, False: 16.1k]
  ------------------
   44|   257k|    if (pool->frame_bufs[i].ref_count > 0 &&
  ------------------
  |  Branch (44:9): [True: 30.6k, False: 227k]
  ------------------
   45|   257k|        pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
  ------------------
  |  Branch (45:9): [True: 30.6k, False: 0]
  ------------------
   46|  30.6k|      pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
   47|  30.6k|      pool->frame_bufs[i].raw_frame_buffer.data = NULL;
   48|  30.6k|      pool->frame_bufs[i].raw_frame_buffer.size = 0;
   49|  30.6k|      pool->frame_bufs[i].raw_frame_buffer.priv = NULL;
   50|  30.6k|      pool->frame_bufs[i].ref_count = 0;
   51|  30.6k|    }
   52|   257k|    aom_free(pool->frame_bufs[i].mvs);
   53|   257k|    pool->frame_bufs[i].mvs = NULL;
   54|   257k|    aom_free(pool->frame_bufs[i].seg_map);
   55|   257k|    pool->frame_bufs[i].seg_map = NULL;
   56|   257k|    aom_free_frame_buffer(&pool->frame_bufs[i].buf);
   57|   257k|  }
   58|  16.1k|  aom_free(pool->frame_bufs);
   59|  16.1k|  pool->frame_bufs = NULL;
   60|  16.1k|  pool->num_frame_bufs = 0;
   61|  16.1k|}
av1_free_cdef_buffers:
  124|  16.1k|                           AV1CdefSync *cdef_sync) {
  125|  16.1k|  CdefInfo *cdef_info = &cm->cdef_info;
  126|  16.1k|  const int num_mi_rows = cdef_info->allocated_mi_rows;
  127|       |
  128|  64.4k|  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|  64.4k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (128:23): [True: 48.3k, False: 16.1k]
  ------------------
  129|  48.3k|    aom_free(cdef_info->linebuf[plane]);
  130|  48.3k|    cdef_info->linebuf[plane] = NULL;
  131|  48.3k|  }
  132|       |  // De-allocation of column buffer & source buffer (worker_0).
  133|  16.1k|  free_cdef_bufs(cdef_info->colbuf, &cdef_info->srcbuf);
  134|       |
  135|  16.1k|  free_cdef_row_sync(&cdef_sync->cdef_row_mt, num_mi_rows);
  136|       |
  137|  16.1k|  if (cdef_info->allocated_num_workers < 2) return;
  ------------------
  |  Branch (137:7): [True: 13.2k, False: 2.82k]
  ------------------
  138|  2.82k|  if (*cdef_worker != NULL) {
  ------------------
  |  Branch (138:7): [True: 1.59k, False: 1.23k]
  ------------------
  139|  51.4k|    for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) {
  ------------------
  |  Branch (139:58): [True: 49.8k, False: 1.59k]
  ------------------
  140|       |      // De-allocation of column buffer & source buffer for remaining workers.
  141|  49.8k|      free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf);
  142|  49.8k|    }
  143|  1.59k|    aom_free(*cdef_worker);
  144|  1.59k|    *cdef_worker = NULL;
  145|  1.59k|  }
  146|  2.82k|}
av1_alloc_cdef_buffers:
  195|  91.1k|                            int init_worker) {
  196|  91.1k|  const int num_planes = av1_num_planes(cm);
  197|  91.1k|  size_t new_linebuf_size[MAX_MB_PLANE] = { 0 };
  198|  91.1k|  size_t new_colbuf_size[MAX_MB_PLANE] = { 0 };
  199|  91.1k|  size_t new_srcbuf_size = 0;
  200|  91.1k|  CdefInfo *const cdef_info = &cm->cdef_info;
  201|       |  // Check for configuration change
  202|  91.1k|  const int num_mi_rows =
  203|  91.1k|      (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  91.1k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  91.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  91.1k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  91.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  204|  91.1k|  const int is_num_workers_changed =
  205|  91.1k|      cdef_info->allocated_num_workers != num_workers;
  206|  91.1k|  const int is_cdef_enabled =
  207|  91.1k|      cm->seq_params->enable_cdef && !cm->tiles.single_tile_decoding;
  ------------------
  |  Branch (207:7): [True: 54.4k, False: 36.7k]
  |  Branch (207:38): [True: 53.8k, False: 552]
  ------------------
  208|       |
  209|       |  // num-bufs=3 represents ping-pong buffers for top linebuf,
  210|       |  // followed by bottom linebuf.
  211|       |  // ping-pong is to avoid top linebuf over-write by consecutive row.
  212|  91.1k|  int num_bufs = 3;
  213|  91.1k|  if (num_workers > 1)
  ------------------
  |  Branch (213:7): [True: 60.3k, False: 30.7k]
  ------------------
  214|  60.3k|    num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  60.3k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  60.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                  num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  60.3k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  60.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  215|       |
  216|  91.1k|  if (is_cdef_enabled) {
  ------------------
  |  Branch (216:7): [True: 53.8k, False: 37.2k]
  ------------------
  217|       |    // Calculate src buffer size
  218|  53.8k|    new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE;
  ------------------
  |  |   32|  53.8k|  (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
  |  |  ------------------
  |  |  |  |   28|  53.8k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  |  |  ------------------
  |  |  |  |  |  |   69|  53.8k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
  |  |  ------------------
  |  |  |  |   31|  53.8k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |                 (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
  |  |  ------------------
  |  |  |  |   23|  53.8k|#define CDEF_VBORDER (2)
  |  |  ------------------
  ------------------
  219|   206k|    for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (219:25): [True: 152k, False: 53.8k]
  ------------------
  220|   152k|      const int shift =
  221|   152k|          plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x;
  ------------------
  |  |  226|   152k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (221:11): [True: 53.8k, False: 98.5k]
  ------------------
  222|       |      // Calculate top and bottom line buffer size
  223|   152k|      const int luma_stride =
  224|   152k|          ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
  ------------------
  |  |   69|   152k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  225|   152k|      new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs *
  226|   152k|                                (CDEF_VBORDER << 1) * (luma_stride >> shift);
  ------------------
  |  |   23|   152k|#define CDEF_VBORDER (2)
  ------------------
  227|       |      // Calculate column buffer size
  228|   152k|      const int block_height =
  229|   152k|          (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER;
  ------------------
  |  |   17|   152k|#define CDEF_BLOCKSIZE 64
  ------------------
                        (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER;
  ------------------
  |  |   39|   152k|#define MI_SIZE_LOG2 2
  ------------------
                        (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER;
  ------------------
  |  |   23|   152k|#define CDEF_VBORDER (2)
  ------------------
  230|   152k|      new_colbuf_size[plane] =
  231|   152k|          sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER;
  ------------------
  |  |   26|   152k|#define CDEF_HBORDER (8)
  ------------------
  232|   152k|    }
  233|  53.8k|  }
  234|       |
  235|       |  // Free src, line and column buffers for worker 0 in case of reallocation
  236|  91.1k|  free_cdef_linebuf_conditional(cm, new_linebuf_size);
  237|  91.1k|  free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf,
  238|  91.1k|                             new_colbuf_size, new_srcbuf_size);
  239|       |
  240|       |  // The flag init_worker indicates if cdef_worker has to be allocated for the
  241|       |  // frame. This is passed as 1 always from decoder. At encoder side, it is 0
  242|       |  // when called for parallel frames during FPMT (where cdef_worker is shared
  243|       |  // across parallel frames) and 1 otherwise.
  244|  91.1k|  if (*cdef_worker != NULL && init_worker) {
  ------------------
  |  Branch (244:7): [True: 39.6k, False: 51.4k]
  |  Branch (244:31): [True: 39.6k, False: 0]
  ------------------
  245|  39.6k|    if (is_num_workers_changed) {
  ------------------
  |  Branch (245:9): [True: 0, False: 39.6k]
  ------------------
  246|       |      // Free src and column buffers for remaining workers in case of change in
  247|       |      // num_workers
  248|      0|      for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--)
  ------------------
  |  Branch (248:60): [True: 0, False: 0]
  ------------------
  249|      0|        free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf);
  250|       |
  251|      0|      aom_free(*cdef_worker);
  252|      0|      *cdef_worker = NULL;
  253|  39.6k|    } else if (num_workers > 1) {
  ------------------
  |  Branch (253:16): [True: 39.6k, False: 0]
  ------------------
  254|       |      // Free src and column buffers for remaining workers in case of
  255|       |      // reallocation
  256|  1.48M|      for (int idx = num_workers - 1; idx >= 1; idx--)
  ------------------
  |  Branch (256:39): [True: 1.44M, False: 39.6k]
  ------------------
  257|  1.44M|        free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf,
  258|  1.44M|                                   &(*cdef_worker)[idx].srcbuf, new_colbuf_size,
  259|  1.44M|                                   new_srcbuf_size);
  260|  39.6k|    }
  261|  39.6k|  }
  262|       |
  263|  91.1k|  if (cdef_info->allocated_mi_rows != num_mi_rows)
  ------------------
  |  Branch (263:7): [True: 10.8k, False: 80.3k]
  ------------------
  264|  10.8k|    free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows);
  265|       |
  266|       |  // Store allocated sizes for reallocation
  267|  91.1k|  cdef_info->allocated_srcbuf_size = new_srcbuf_size;
  268|  91.1k|  av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size);
  ------------------
  |  |   31|  91.1k|  do {                                   \
  |  |   32|  91.1k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  91.1k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  91.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  269|  91.1k|  av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size);
  ------------------
  |  |   31|  91.1k|  do {                                   \
  |  |   32|  91.1k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|  91.1k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|  91.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  270|       |  // Store configuration to check change in configuration
  271|  91.1k|  cdef_info->allocated_mi_rows = num_mi_rows;
  272|  91.1k|  cdef_info->allocated_num_workers = num_workers;
  273|       |
  274|  91.1k|  if (!is_cdef_enabled) return;
  ------------------
  |  Branch (274:7): [True: 37.2k, False: 53.8k]
  ------------------
  275|       |
  276|       |  // Memory allocation of column buffer & source buffer (worker_0).
  277|  53.8k|  alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes);
  278|  53.8k|  alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes);
  279|       |
  280|  53.8k|  if (num_workers < 2) return;
  ------------------
  |  Branch (280:7): [True: 14.0k, False: 39.8k]
  ------------------
  281|       |
  282|  39.8k|  if (init_worker) {
  ------------------
  |  Branch (282:7): [True: 39.8k, False: 0]
  ------------------
  283|  39.8k|    if (*cdef_worker == NULL)
  ------------------
  |  Branch (283:9): [True: 1.59k, False: 38.2k]
  ------------------
  284|  39.8k|      CHECK_MEM_ERROR(cm, *cdef_worker,
  ------------------
  |  |   51|  1.59k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.59k|  do {                                                    \
  |  |  |  |   69|  1.59k|    lval = (expr);                                        \
  |  |  |  |   70|  1.59k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.59k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.59k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.59k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  285|  39.8k|                      aom_calloc(num_workers, sizeof(**cdef_worker)));
  286|       |
  287|       |    // Memory allocation of column buffer & source buffer for remaining workers.
  288|  1.49M|    for (int idx = num_workers - 1; idx >= 1; idx--)
  ------------------
  |  Branch (288:37): [True: 1.45M, False: 39.8k]
  ------------------
  289|  1.45M|      alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf,
  290|  1.45M|                      &(*cdef_worker)[idx].srcbuf, num_planes);
  291|  39.8k|  }
  292|       |
  293|  39.8k|  alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt,
  294|  39.8k|                      cdef_info->allocated_mi_rows);
  295|  39.8k|}
av1_alloc_restoration_buffers:
  299|  30.2k|void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) {
  300|  30.2k|  const int num_planes = av1_num_planes(cm);
  301|       |
  302|  30.2k|  if (cm->rst_tmpbuf == NULL && is_sgr_enabled) {
  ------------------
  |  Branch (302:7): [True: 5.02k, False: 25.1k]
  |  Branch (302:33): [True: 5.02k, False: 0]
  ------------------
  303|  5.02k|    CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
  ------------------
  |  |   51|  5.02k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  5.02k|  do {                                                    \
  |  |  |  |   69|  5.02k|    lval = (expr);                                        \
  |  |  |  |   70|  5.02k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 5.02k]
  |  |  |  |  ------------------
  |  |  |  |   71|  5.02k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  5.02k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  304|  5.02k|                    (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
  305|  5.02k|  }
  306|       |
  307|  30.2k|  if (cm->rlbs == NULL) {
  ------------------
  |  Branch (307:7): [True: 5.02k, False: 25.1k]
  ------------------
  308|  5.02k|    CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
  ------------------
  |  |   51|  5.02k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  5.02k|  do {                                                    \
  |  |  |  |   69|  5.02k|    lval = (expr);                                        \
  |  |  |  |   70|  5.02k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 5.02k]
  |  |  |  |  ------------------
  |  |  |  |   71|  5.02k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  5.02k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  309|  5.02k|  }
  310|       |
  311|       |  // For striped loop restoration, we divide each plane into "stripes",
  312|       |  // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
  313|       |  // luma pixels to match the output from CDEF. We will need to store 2 *
  314|       |  // RESTORATION_CTX_VERT lines of data for each stripe.
  315|  30.2k|  int mi_h = cm->mi_params.mi_rows;
  316|  30.2k|  const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
  ------------------
  |  |   37|  30.2k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
                const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
  ------------------
  |  |   39|  30.2k|#define MI_SIZE_LOG2 2
  ------------------
  317|  30.2k|  const int num_stripes = (ext_h + 63) / 64;
  318|       |
  319|       |  // Now we need to allocate enough space to store the line buffers for the
  320|       |  // stripes
  321|  30.2k|  const int frame_w = cm->superres_upscaled_width;
  322|  30.2k|  const int use_highbd = cm->seq_params->use_highbitdepth;
  323|       |
  324|   111k|  for (int p = 0; p < num_planes; ++p) {
  ------------------
  |  Branch (324:19): [True: 81.7k, False: 30.2k]
  ------------------
  325|  81.7k|    const int is_uv = p > 0;
  326|  81.7k|    const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (326:22): [True: 51.5k, False: 30.2k]
  |  Branch (326:31): [True: 44.7k, False: 6.80k]
  ------------------
  327|  81.7k|    const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|  81.7k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  328|  81.7k|    const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
  ------------------
  |  |   69|  81.7k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  329|  81.7k|    const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
  ------------------
  |  |   66|  81.7k|#define RESTORATION_CTX_VERT 2
  ------------------
  330|  81.7k|                         << use_highbd;
  331|  81.7k|    RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
  332|       |
  333|  81.7k|    if (buf_size != boundaries->stripe_boundary_size ||
  ------------------
  |  Branch (333:9): [True: 31.3k, False: 50.4k]
  ------------------
  334|  81.7k|        boundaries->stripe_boundary_above == NULL ||
  ------------------
  |  Branch (334:9): [True: 0, False: 50.4k]
  ------------------
  335|  81.7k|        boundaries->stripe_boundary_below == NULL) {
  ------------------
  |  Branch (335:9): [True: 0, False: 50.4k]
  ------------------
  336|  31.3k|      aom_free(boundaries->stripe_boundary_above);
  337|  31.3k|      aom_free(boundaries->stripe_boundary_below);
  338|       |
  339|  31.3k|      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above,
  ------------------
  |  |   51|  31.3k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  31.3k|  do {                                                    \
  |  |  |  |   69|  31.3k|    lval = (expr);                                        \
  |  |  |  |   70|  31.3k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 31.3k]
  |  |  |  |  ------------------
  |  |  |  |   71|  31.3k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  31.3k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  340|  31.3k|                      (uint8_t *)aom_memalign(32, buf_size));
  341|  31.3k|      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below,
  ------------------
  |  |   51|  31.3k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  31.3k|  do {                                                    \
  |  |  |  |   69|  31.3k|    lval = (expr);                                        \
  |  |  |  |   70|  31.3k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 31.3k]
  |  |  |  |  ------------------
  |  |  |  |   71|  31.3k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  31.3k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  342|  31.3k|                      (uint8_t *)aom_memalign(32, buf_size));
  343|       |
  344|  31.3k|      boundaries->stripe_boundary_size = buf_size;
  345|  31.3k|    }
  346|  81.7k|    boundaries->stripe_boundary_stride = stride;
  347|  81.7k|  }
  348|  30.2k|}
av1_free_restoration_buffers:
  350|  16.1k|void av1_free_restoration_buffers(AV1_COMMON *cm) {
  351|  16.1k|  int p;
  352|  64.4k|  for (p = 0; p < MAX_MB_PLANE; ++p)
  ------------------
  |  |   36|  64.4k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (352:15): [True: 48.3k, False: 16.1k]
  ------------------
  353|  48.3k|    av1_free_restoration_struct(&cm->rst_info[p]);
  354|  16.1k|  aom_free(cm->rst_tmpbuf);
  355|  16.1k|  cm->rst_tmpbuf = NULL;
  356|  16.1k|  aom_free(cm->rlbs);
  357|  16.1k|  cm->rlbs = NULL;
  358|  64.4k|  for (p = 0; p < MAX_MB_PLANE; ++p) {
  ------------------
  |  |   36|  64.4k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (358:15): [True: 48.3k, False: 16.1k]
  ------------------
  359|  48.3k|    RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
  360|  48.3k|    aom_free(boundaries->stripe_boundary_above);
  361|  48.3k|    aom_free(boundaries->stripe_boundary_below);
  362|  48.3k|    boundaries->stripe_boundary_above = NULL;
  363|  48.3k|    boundaries->stripe_boundary_below = NULL;
  364|  48.3k|  }
  365|       |
  366|  16.1k|  aom_free_frame_buffer(&cm->rst_frame);
  367|  16.1k|}
av1_free_above_context_buffers:
  370|  51.7k|void av1_free_above_context_buffers(CommonContexts *above_contexts) {
  371|  51.7k|  int i;
  372|  51.7k|  const int num_planes = above_contexts->num_planes;
  373|       |
  374|  98.9k|  for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) {
  ------------------
  |  Branch (374:26): [True: 47.2k, False: 51.7k]
  ------------------
  375|   185k|    for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (375:17): [True: 138k, False: 47.2k]
  ------------------
  376|   138k|      if (above_contexts->entropy[i] == NULL) break;
  ------------------
  |  Branch (376:11): [True: 0, False: 138k]
  ------------------
  377|   138k|      aom_free(above_contexts->entropy[i][tile_row]);
  378|   138k|      above_contexts->entropy[i][tile_row] = NULL;
  379|   138k|    }
  380|  47.2k|    if (above_contexts->partition != NULL) {
  ------------------
  |  Branch (380:9): [True: 47.2k, False: 0]
  ------------------
  381|  47.2k|      aom_free(above_contexts->partition[tile_row]);
  382|  47.2k|      above_contexts->partition[tile_row] = NULL;
  383|  47.2k|    }
  384|       |
  385|  47.2k|    if (above_contexts->txfm != NULL) {
  ------------------
  |  Branch (385:9): [True: 47.2k, False: 0]
  ------------------
  386|  47.2k|      aom_free(above_contexts->txfm[tile_row]);
  387|  47.2k|      above_contexts->txfm[tile_row] = NULL;
  388|  47.2k|    }
  389|  47.2k|  }
  390|   106k|  for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (390:15): [True: 54.7k, False: 51.7k]
  ------------------
  391|  54.7k|    aom_free(above_contexts->entropy[i]);
  392|  54.7k|    above_contexts->entropy[i] = NULL;
  393|  54.7k|  }
  394|  51.7k|  aom_free(above_contexts->partition);
  395|  51.7k|  above_contexts->partition = NULL;
  396|       |
  397|  51.7k|  aom_free(above_contexts->txfm);
  398|  51.7k|  above_contexts->txfm = NULL;
  399|       |
  400|  51.7k|  above_contexts->num_tile_rows = 0;
  401|  51.7k|  above_contexts->num_mi_cols = 0;
  402|  51.7k|  above_contexts->num_planes = 0;
  403|  51.7k|}
av1_free_context_buffers:
  405|  32.4k|void av1_free_context_buffers(AV1_COMMON *cm) {
  406|  32.4k|  if (cm->mi_params.free_mi != NULL) cm->mi_params.free_mi(&cm->mi_params);
  ------------------
  |  Branch (406:7): [True: 32.4k, False: 0]
  ------------------
  407|       |
  408|  32.4k|  av1_free_above_context_buffers(&cm->above_contexts);
  409|  32.4k|}
av1_alloc_above_context_buffers:
  413|  19.2k|                                    int num_planes) {
  414|  19.2k|  const int aligned_mi_cols =
  415|  19.2k|      ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   69|  19.2k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  416|       |
  417|       |  // Allocate above context buffers
  418|  19.2k|  above_contexts->num_tile_rows = num_tile_rows;
  419|  19.2k|  above_contexts->num_mi_cols = aligned_mi_cols;
  420|  19.2k|  above_contexts->num_planes = num_planes;
  421|  74.0k|  for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
  ------------------
  |  Branch (421:27): [True: 54.7k, False: 19.2k]
  ------------------
  422|  54.7k|    above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
  423|  54.7k|        num_tile_rows, sizeof(above_contexts->entropy[0]));
  424|  54.7k|    if (!above_contexts->entropy[plane_idx]) return 1;
  ------------------
  |  Branch (424:9): [True: 0, False: 54.7k]
  ------------------
  425|  54.7k|  }
  426|       |
  427|  19.2k|  above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc(
  428|  19.2k|      num_tile_rows, sizeof(above_contexts->partition));
  429|  19.2k|  if (!above_contexts->partition) return 1;
  ------------------
  |  Branch (429:7): [True: 0, False: 19.2k]
  ------------------
  430|       |
  431|  19.2k|  above_contexts->txfm =
  432|  19.2k|      (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm));
  433|  19.2k|  if (!above_contexts->txfm) return 1;
  ------------------
  |  Branch (433:7): [True: 0, False: 19.2k]
  ------------------
  434|       |
  435|  66.5k|  for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) {
  ------------------
  |  Branch (435:26): [True: 47.2k, False: 19.2k]
  ------------------
  436|   185k|    for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
  ------------------
  |  Branch (436:29): [True: 138k, False: 47.2k]
  ------------------
  437|   138k|      above_contexts->entropy[plane_idx][tile_row] =
  438|   138k|          (ENTROPY_CONTEXT *)aom_calloc(
  439|   138k|              aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row]));
  440|   138k|      if (!above_contexts->entropy[plane_idx][tile_row]) return 1;
  ------------------
  |  Branch (440:11): [True: 0, False: 138k]
  ------------------
  441|   138k|    }
  442|       |
  443|  47.2k|    above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
  444|  47.2k|        aligned_mi_cols, sizeof(*above_contexts->partition[tile_row]));
  445|  47.2k|    if (!above_contexts->partition[tile_row]) return 1;
  ------------------
  |  Branch (445:9): [True: 0, False: 47.2k]
  ------------------
  446|       |
  447|  47.2k|    above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc(
  448|  47.2k|        aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row]));
  449|  47.2k|    if (!above_contexts->txfm[tile_row]) return 1;
  ------------------
  |  Branch (449:9): [True: 0, False: 47.2k]
  ------------------
  450|  47.2k|  }
  451|       |
  452|  19.2k|  return 0;
  453|  19.2k|}
av1_alloc_context_buffers:
  488|  56.1k|                              BLOCK_SIZE min_partition_size) {
  489|  56.1k|  CommonModeInfoParams *const mi_params = &cm->mi_params;
  490|  56.1k|  mi_params->set_mb_mi(mi_params, width, height, min_partition_size);
  491|  56.1k|  if (alloc_mi(mi_params)) goto fail;
  ------------------
  |  Branch (491:7): [True: 262, False: 55.8k]
  ------------------
  492|  55.8k|  return 0;
  493|       |
  494|    262|fail:
  495|       |  // clear the mi_* values to force a realloc on resync
  496|    262|  mi_params->set_mb_mi(mi_params, 0, 0, BLOCK_4X4);
  497|    262|  av1_free_context_buffers(cm);
  498|    262|  return 1;
  499|  56.1k|}
av1_remove_common:
  501|  32.2k|void av1_remove_common(AV1_COMMON *cm) {
  502|  32.2k|  av1_free_context_buffers(cm);
  503|       |
  504|  32.2k|  aom_free(cm->fc);
  505|  32.2k|  cm->fc = NULL;
  506|  32.2k|  aom_free(cm->default_frame_context);
  507|  32.2k|  cm->default_frame_context = NULL;
  508|  32.2k|}
av1_init_mi_buffers:
  510|  95.4k|void av1_init_mi_buffers(CommonModeInfoParams *mi_params) {
  511|  95.4k|  mi_params->setup_mi(mi_params);
  512|  95.4k|}
alloccommon.c:free_cdef_bufs:
   92|  65.9k|static inline void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) {
   93|  65.9k|  aom_free(*srcbuf);
   94|  65.9k|  *srcbuf = NULL;
   95|   263k|  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|   263k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (95:23): [True: 197k, False: 65.9k]
  ------------------
   96|   197k|    aom_free(colbuf[plane]);
   97|   197k|    colbuf[plane] = NULL;
   98|   197k|  }
   99|  65.9k|}
alloccommon.c:free_cdef_row_sync:
  102|  26.9k|                                      const int num_mi_rows) {
  103|  26.9k|  if (*cdef_row_mt == NULL) return;
  ------------------
  |  Branch (103:7): [True: 23.6k, False: 3.31k]
  ------------------
  104|  3.31k|#if CONFIG_MULTITHREAD
  105|  21.3k|  for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) {
  ------------------
  |  Branch (105:25): [True: 18.0k, False: 3.31k]
  ------------------
  106|  18.0k|    if ((*cdef_row_mt)[row_idx].row_mutex_ != NULL) {
  ------------------
  |  Branch (106:9): [True: 18.0k, False: 0]
  ------------------
  107|  18.0k|      pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_);
  108|  18.0k|      aom_free((*cdef_row_mt)[row_idx].row_mutex_);
  109|  18.0k|    }
  110|  18.0k|    if ((*cdef_row_mt)[row_idx].row_cond_ != NULL) {
  ------------------
  |  Branch (110:9): [True: 18.0k, False: 0]
  ------------------
  111|  18.0k|      pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_);
  112|  18.0k|      aom_free((*cdef_row_mt)[row_idx].row_cond_);
  113|  18.0k|    }
  114|  18.0k|  }
  115|       |#else
  116|       |  (void)num_mi_rows;
  117|       |#endif  // CONFIG_MULTITHREAD
  118|  3.31k|  aom_free(*cdef_row_mt);
  119|  3.31k|  *cdef_row_mt = NULL;
  120|  3.31k|}
alloccommon.c:free_cdef_linebuf_conditional:
   64|  91.1k|    AV1_COMMON *const cm, const size_t *new_linebuf_size) {
   65|  91.1k|  CdefInfo *cdef_info = &cm->cdef_info;
   66|   364k|  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|   364k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (66:23): [True: 273k, False: 91.1k]
  ------------------
   67|   273k|    if (new_linebuf_size[plane] != cdef_info->allocated_linebuf_size[plane]) {
  ------------------
  |  Branch (67:9): [True: 22.6k, False: 250k]
  ------------------
   68|  22.6k|      aom_free(cdef_info->linebuf[plane]);
   69|  22.6k|      cdef_info->linebuf[plane] = NULL;
   70|  22.6k|    }
   71|   273k|  }
   72|  91.1k|}
alloccommon.c:free_cdef_bufs_conditional:
   78|  1.53M|                                              const size_t new_srcbuf_size) {
   79|  1.53M|  CdefInfo *cdef_info = &cm->cdef_info;
   80|  1.53M|  if (new_srcbuf_size != cdef_info->allocated_srcbuf_size) {
  ------------------
  |  Branch (80:7): [True: 47.9k, False: 1.48M]
  ------------------
   81|  47.9k|    aom_free(*srcbuf);
   82|  47.9k|    *srcbuf = NULL;
   83|  47.9k|  }
   84|  6.13M|  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|  6.13M|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (84:23): [True: 4.60M, False: 1.53M]
  ------------------
   85|  4.60M|    if (new_colbuf_size[plane] != cdef_info->allocated_colbuf_size[plane]) {
  ------------------
  |  Branch (85:9): [True: 55.9k, False: 4.54M]
  ------------------
   86|  55.9k|      aom_free(colbuf[plane]);
   87|  55.9k|      colbuf[plane] = NULL;
   88|  55.9k|    }
   89|  4.60M|  }
   90|  1.53M|}
alloccommon.c:alloc_cdef_bufs:
  159|  1.50M|                                   uint16_t **srcbuf, const int num_planes) {
  160|  1.50M|  CdefInfo *cdef_info = &cm->cdef_info;
  161|  1.50M|  if (*srcbuf == NULL)
  ------------------
  |  Branch (161:7): [True: 74.8k, False: 1.43M]
  ------------------
  162|  1.50M|    CHECK_MEM_ERROR(cm, *srcbuf,
  ------------------
  |  |   51|  74.8k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  74.8k|  do {                                                    \
  |  |  |  |   69|  74.8k|    lval = (expr);                                        \
  |  |  |  |   70|  74.8k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 74.8k]
  |  |  |  |  ------------------
  |  |  |  |   71|  74.8k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  74.8k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  163|  1.50M|                    aom_memalign(16, cdef_info->allocated_srcbuf_size));
  164|       |
  165|  5.89M|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (165:23): [True: 4.39M, False: 1.50M]
  ------------------
  166|  4.39M|    if (colbuf[plane] == NULL)
  ------------------
  |  Branch (166:9): [True: 173k, False: 4.21M]
  ------------------
  167|  4.39M|      CHECK_MEM_ERROR(cm, colbuf[plane],
  ------------------
  |  |   51|   173k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   173k|  do {                                                    \
  |  |  |  |   69|   173k|    lval = (expr);                                        \
  |  |  |  |   70|   173k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 173k]
  |  |  |  |  ------------------
  |  |  |  |   71|   173k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   173k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  168|  4.39M|                      aom_malloc(cdef_info->allocated_colbuf_size[plane]));
  169|  4.39M|  }
  170|  1.50M|}
alloccommon.c:alloc_cdef_linebuf:
  149|  53.8k|                                      const int num_planes) {
  150|  53.8k|  CdefInfo *cdef_info = &cm->cdef_info;
  151|   206k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (151:23): [True: 152k, False: 53.8k]
  ------------------
  152|   152k|    if (linebuf[plane] == NULL)
  ------------------
  |  Branch (152:9): [True: 21.7k, False: 130k]
  ------------------
  153|   152k|      CHECK_MEM_ERROR(cm, linebuf[plane],
  ------------------
  |  |   51|  21.7k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  21.7k|  do {                                                    \
  |  |  |  |   69|  21.7k|    lval = (expr);                                        \
  |  |  |  |   70|  21.7k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 21.7k]
  |  |  |  |  ------------------
  |  |  |  |   71|  21.7k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  21.7k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  154|   152k|                      aom_malloc(cdef_info->allocated_linebuf_size[plane]));
  155|   152k|  }
  156|  53.8k|}
alloccommon.c:alloc_cdef_row_sync:
  174|  39.8k|                                       const int num_mi_rows) {
  175|  39.8k|  if (*cdef_row_mt != NULL) return;
  ------------------
  |  Branch (175:7): [True: 36.5k, False: 3.31k]
  ------------------
  176|       |
  177|  3.31k|  CHECK_MEM_ERROR(cm, *cdef_row_mt,
  ------------------
  |  |   51|  3.31k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.31k|  do {                                                    \
  |  |  |  |   69|  3.31k|    lval = (expr);                                        \
  |  |  |  |   70|  3.31k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.31k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.31k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.31k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  178|  3.31k|                  aom_calloc(num_mi_rows, sizeof(**cdef_row_mt)));
  179|  3.31k|#if CONFIG_MULTITHREAD
  180|  21.3k|  for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) {
  ------------------
  |  Branch (180:25): [True: 18.0k, False: 3.31k]
  ------------------
  181|  18.0k|    CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_,
  ------------------
  |  |   51|  18.0k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  18.0k|  do {                                                    \
  |  |  |  |   69|  18.0k|    lval = (expr);                                        \
  |  |  |  |   70|  18.0k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 18.0k]
  |  |  |  |  ------------------
  |  |  |  |   71|  18.0k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  18.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  182|  18.0k|                    aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_)));
  183|  18.0k|    pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL);
  184|       |
  185|  18.0k|    CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_,
  ------------------
  |  |   51|  18.0k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  18.0k|  do {                                                    \
  |  |  |  |   69|  18.0k|    lval = (expr);                                        \
  |  |  |  |   70|  18.0k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 18.0k]
  |  |  |  |  ------------------
  |  |  |  |   71|  18.0k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  18.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  186|  18.0k|                    aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_)));
  187|  18.0k|    pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL);
  188|  18.0k|  }
  189|  3.31k|#endif  // CONFIG_MULTITHREAD
  190|  3.31k|}
alloccommon.c:alloc_mi:
  458|  56.1k|static int alloc_mi(CommonModeInfoParams *mi_params) {
  459|  56.1k|  const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows);
  460|  56.1k|  const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows;
  461|  56.1k|  const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
  462|  56.1k|  const int alloc_mi_size =
  463|  56.1k|      mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d);
  464|       |
  465|  56.1k|  if (mi_params->mi_alloc_size < alloc_mi_size ||
  ------------------
  |  Branch (465:7): [True: 19.2k, False: 36.8k]
  ------------------
  466|  56.1k|      mi_params->mi_grid_size < mi_grid_size) {
  ------------------
  |  Branch (466:7): [True: 0, False: 36.8k]
  ------------------
  467|  19.2k|    mi_params->free_mi(mi_params);
  468|       |
  469|  19.2k|    mi_params->mi_alloc =
  470|  19.2k|        aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc));
  471|  19.2k|    if (!mi_params->mi_alloc) return 1;
  ------------------
  |  Branch (471:9): [True: 262, False: 19.0k]
  ------------------
  472|  19.0k|    mi_params->mi_alloc_size = alloc_mi_size;
  473|       |
  474|  19.0k|    mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc(
  475|  19.0k|        mi_grid_size, sizeof(*mi_params->mi_grid_base));
  476|  19.0k|    if (!mi_params->mi_grid_base) return 1;
  ------------------
  |  Branch (476:9): [True: 0, False: 19.0k]
  ------------------
  477|       |
  478|  19.0k|    mi_params->tx_type_map =
  479|  19.0k|        aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map));
  480|  19.0k|    if (!mi_params->tx_type_map) return 1;
  ------------------
  |  Branch (480:9): [True: 0, False: 19.0k]
  ------------------
  481|  19.0k|    mi_params->mi_grid_size = mi_grid_size;
  482|  19.0k|  }
  483|       |
  484|  55.8k|  return 0;
  485|  56.1k|}

av1_dx_iface.c:lock_buffer_pool:
 1082|   272k|static void lock_buffer_pool(BufferPool *const pool) {
 1083|   272k|#if CONFIG_MULTITHREAD
 1084|   272k|  pthread_mutex_lock(&pool->pool_mutex);
 1085|       |#else
 1086|       |  (void)pool;
 1087|       |#endif
 1088|   272k|}
av1_dx_iface.c:unlock_buffer_pool:
 1090|   272k|static void unlock_buffer_pool(BufferPool *const pool) {
 1091|   272k|#if CONFIG_MULTITHREAD
 1092|   272k|  pthread_mutex_unlock(&pool->pool_mutex);
 1093|       |#else
 1094|       |  (void)pool;
 1095|       |#endif
 1096|   272k|}
av1_dx_iface.c:frame_is_intra_only:
 1174|  40.4k|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|  40.4k|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 21.6k, False: 18.7k]
  ------------------
 1176|  40.4k|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 18.7k, False: 0]
  ------------------
 1177|  40.4k|}
av1_dx_iface.c:av1_num_planes:
 1271|  48.7k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  48.7k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  48.5k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 175, False: 48.5k]
  ------------------
 1273|  48.7k|}
decodeframe.c:set_sb_size:
 1851|  80.6k|                               BLOCK_SIZE sb_size) {
 1852|  80.6k|  seq_params->sb_size = sb_size;
 1853|  80.6k|  seq_params->mib_size = mi_size_wide[seq_params->sb_size];
 1854|  80.6k|  seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size];
 1855|  80.6k|}
decodeframe.c:av1_num_planes:
 1271|  84.4M|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  84.4M|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  81.5M|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 2.88M, False: 81.5M]
  ------------------
 1273|  84.4M|}
decodeframe.c:lock_buffer_pool:
 1082|   854k|static void lock_buffer_pool(BufferPool *const pool) {
 1083|   854k|#if CONFIG_MULTITHREAD
 1084|   854k|  pthread_mutex_lock(&pool->pool_mutex);
 1085|       |#else
 1086|       |  (void)pool;
 1087|       |#endif
 1088|   854k|}
decodeframe.c:assign_frame_buffer_p:
 1161|  1.60k|                                         RefCntBuffer *rhs_ptr) {
 1162|  1.60k|  RefCntBuffer *const old_ptr = *lhs_ptr;
 1163|  1.60k|  if (old_ptr != NULL) {
  ------------------
  |  Branch (1163:7): [True: 1.60k, False: 0]
  ------------------
 1164|  1.60k|    assert(old_ptr->ref_count > 0);
 1165|       |    // One less reference to the buffer at 'old_ptr', so decrease ref count.
 1166|  1.60k|    --old_ptr->ref_count;
 1167|  1.60k|  }
 1168|       |
 1169|  1.60k|  *lhs_ptr = rhs_ptr;
 1170|       |  // One more reference to the buffer at 'rhs_ptr', so increase ref count.
 1171|  1.60k|  ++rhs_ptr->ref_count;
 1172|  1.60k|}
decodeframe.c:unlock_buffer_pool:
 1090|   854k|static void unlock_buffer_pool(BufferPool *const pool) {
 1091|   854k|#if CONFIG_MULTITHREAD
 1092|   854k|  pthread_mutex_unlock(&pool->pool_mutex);
 1093|       |#else
 1094|       |  (void)pool;
 1095|       |#endif
 1096|   854k|}
decodeframe.c:frame_is_sframe:
 1179|   454k|static inline int frame_is_sframe(const AV1_COMMON *cm) {
 1180|   454k|  return cm->current_frame.frame_type == S_FRAME;
 1181|   454k|}
decodeframe.c:frame_is_intra_only:
 1174|  1.03M|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|  1.03M|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 515k, False: 522k]
  ------------------
 1176|  1.03M|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 130k, False: 392k]
  ------------------
 1177|  1.03M|}
decodeframe.c:get_free_fb:
 1104|   277k|static inline int get_free_fb(AV1_COMMON *cm) {
 1105|   277k|  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 1106|   277k|  int i;
 1107|       |
 1108|   277k|  lock_buffer_pool(cm->buffer_pool);
 1109|   277k|  const int num_frame_bufs = cm->buffer_pool->num_frame_bufs;
 1110|  1.45M|  for (i = 0; i < num_frame_bufs; ++i)
  ------------------
  |  Branch (1110:15): [True: 1.45M, False: 0]
  ------------------
 1111|  1.45M|    if (frame_bufs[i].ref_count == 0) break;
  ------------------
  |  Branch (1111:9): [True: 277k, False: 1.17M]
  ------------------
 1112|       |
 1113|   277k|  if (i != num_frame_bufs) {
  ------------------
  |  Branch (1113:7): [True: 277k, False: 0]
  ------------------
 1114|   277k|    if (frame_bufs[i].buf.use_external_reference_buffers) {
  ------------------
  |  Branch (1114:9): [True: 0, False: 277k]
  ------------------
 1115|       |      // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the
 1116|       |      // external reference buffers. Restore the buffer pointers to point to the
 1117|       |      // internally allocated memory.
 1118|      0|      YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
 1119|      0|      ybf->y_buffer = ybf->store_buf_adr[0];
 1120|      0|      ybf->u_buffer = ybf->store_buf_adr[1];
 1121|      0|      ybf->v_buffer = ybf->store_buf_adr[2];
 1122|      0|      ybf->use_external_reference_buffers = 0;
 1123|      0|    }
 1124|       |
 1125|   277k|    frame_bufs[i].ref_count = 1;
 1126|   277k|  } else {
 1127|       |    // We should never run out of free buffers. If this assertion fails, there
 1128|       |    // is a reference leak.
 1129|      0|    assert(0 && "Ran out of free frame buffers. Likely a reference leak.");
 1130|       |    // Reset i to be INVALID_IDX to indicate no free buffer found.
 1131|      0|    i = INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1132|      0|  }
 1133|       |
 1134|   277k|  unlock_buffer_pool(cm->buffer_pool);
 1135|   277k|  return i;
 1136|   277k|}
decodeframe.c:ensure_mv_buffer:
 1235|   203k|static inline void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
 1236|   203k|  const int buf_rows = buf->mi_rows;
 1237|   203k|  const int buf_cols = buf->mi_cols;
 1238|   203k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
 1239|       |
 1240|   203k|  if (buf->mvs == NULL || buf_rows != mi_params->mi_rows ||
  ------------------
  |  Branch (1240:7): [True: 24.8k, False: 178k]
  |  Branch (1240:27): [True: 55.2k, False: 123k]
  ------------------
 1241|   203k|      buf_cols != mi_params->mi_cols) {
  ------------------
  |  Branch (1241:7): [True: 13.3k, False: 110k]
  ------------------
 1242|  93.4k|    aom_free(buf->mvs);
 1243|  93.4k|    buf->mi_rows = mi_params->mi_rows;
 1244|  93.4k|    buf->mi_cols = mi_params->mi_cols;
 1245|  93.4k|    CHECK_MEM_ERROR(cm, buf->mvs,
  ------------------
  |  |   51|  93.4k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  93.4k|  do {                                                    \
  |  |  |  |   69|  93.4k|    lval = (expr);                                        \
  |  |  |  |   70|  93.4k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 93.4k]
  |  |  |  |  ------------------
  |  |  |  |   71|  93.4k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  93.4k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1246|  93.4k|                    (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) *
 1247|  93.4k|                                             ((mi_params->mi_cols + 1) >> 1),
 1248|  93.4k|                                         sizeof(*buf->mvs)));
 1249|  93.4k|    aom_free(buf->seg_map);
 1250|  93.4k|    CHECK_MEM_ERROR(
  ------------------
  |  |   51|  93.4k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  93.4k|  do {                                                    \
  |  |  |  |   69|  93.4k|    lval = (expr);                                        \
  |  |  |  |   70|  93.4k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 93.4k]
  |  |  |  |  ------------------
  |  |  |  |   71|  93.4k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  93.4k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1251|  93.4k|        cm, buf->seg_map,
 1252|  93.4k|        (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols,
 1253|  93.4k|                              sizeof(*buf->seg_map)));
 1254|  93.4k|  }
 1255|       |
 1256|   203k|  const int mem_size =
 1257|   203k|      ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1);
  ------------------
  |  |   44|   203k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|   203k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   203k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|   203k|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1258|       |
 1259|   203k|  if (cm->tpl_mvs == NULL || cm->tpl_mvs_mem_size < mem_size) {
  ------------------
  |  Branch (1259:7): [True: 14.5k, False: 188k]
  |  Branch (1259:30): [True: 5.06k, False: 183k]
  ------------------
 1260|  19.6k|    aom_free(cm->tpl_mvs);
 1261|  19.6k|    CHECK_MEM_ERROR(cm, cm->tpl_mvs,
  ------------------
  |  |   51|  19.6k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  19.6k|  do {                                                    \
  |  |  |  |   69|  19.6k|    lval = (expr);                                        \
  |  |  |  |   70|  19.6k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 19.6k]
  |  |  |  |  ------------------
  |  |  |  |   71|  19.6k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  19.6k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1262|  19.6k|                    (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs)));
 1263|  19.6k|    cm->tpl_mvs_mem_size = mem_size;
 1264|  19.6k|  }
 1265|   203k|}
decodeframe.c:frame_might_allow_ref_frame_mvs:
 1222|  69.9k|static inline int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
 1223|  69.9k|  return !cm->features.error_resilient_mode &&
  ------------------
  |  Branch (1223:10): [True: 62.6k, False: 7.26k]
  ------------------
 1224|  69.9k|         cm->seq_params->order_hint_info.enable_ref_frame_mvs &&
  ------------------
  |  Branch (1224:10): [True: 61.7k, False: 926]
  ------------------
 1225|  69.9k|         cm->seq_params->order_hint_info.enable_order_hint &&
  ------------------
  |  Branch (1225:10): [True: 61.7k, False: 0]
  ------------------
 1226|  69.9k|         !frame_is_intra_only(cm);
  ------------------
  |  Branch (1226:10): [True: 61.7k, False: 0]
  ------------------
 1227|  69.9k|}
decodeframe.c:get_ref_frame_buf:
 1194|  8.02M|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|  8.02M|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|  8.02M|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|  8.02M|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 8.02M, False: 246]
  ------------------
 1197|  8.02M|}
decodeframe.c:get_ref_frame_map_idx:
 1187|  15.6M|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|  15.6M|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 15.6M, False: 18.4E]
  |  Branch (1188:38): [True: 15.6M, False: 18.4E]
  ------------------
 1189|  15.6M|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|  18.4E|             : INVALID_IDX;
  ------------------
  |  |   15|  18.4E|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|  15.6M|}
decodeframe.c:get_ref_scale_factors:
 1208|   342k|    AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1209|   342k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1210|   342k|  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
  ------------------
  |  |   15|   342k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1210:10): [True: 342k, False: 0]
  ------------------
 1211|   342k|}
decodeframe.c:is_coded_lossless:
 1861|   186k|                                    const MACROBLOCKD *xd) {
 1862|   186k|  int coded_lossless = 1;
 1863|   186k|  if (cm->seg.enabled) {
  ------------------
  |  Branch (1863:7): [True: 23.0k, False: 163k]
  ------------------
 1864|  28.1k|    for (int i = 0; i < MAX_SEGMENTS; ++i) {
  ------------------
  |  |   21|  28.1k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (1864:21): [True: 27.7k, False: 417]
  ------------------
 1865|  27.7k|      if (!xd->lossless[i]) {
  ------------------
  |  Branch (1865:11): [True: 22.6k, False: 5.10k]
  ------------------
 1866|  22.6k|        coded_lossless = 0;
 1867|  22.6k|        break;
 1868|  22.6k|      }
 1869|  27.7k|    }
 1870|   163k|  } else {
 1871|   163k|    coded_lossless = xd->lossless[0];
 1872|   163k|  }
 1873|   186k|  return coded_lossless;
 1874|   186k|}
decodeframe.c:frame_might_allow_warped_motion:
 1230|   181k|static inline int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
 1231|   181k|  return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) &&
  ------------------
  |  Branch (1231:10): [True: 82.9k, False: 98.9k]
  |  Branch (1231:48): [True: 39.4k, False: 43.5k]
  ------------------
 1232|   181k|         cm->seq_params->enable_warped_motion;
  ------------------
  |  Branch (1232:10): [True: 32.4k, False: 6.92k]
  ------------------
 1233|   181k|}
decodeframe.c:get_primary_ref_frame_buf:
 1214|   208k|    const AV1_COMMON *const cm) {
 1215|   208k|  const int primary_ref_frame = cm->features.primary_ref_frame;
 1216|   208k|  if (primary_ref_frame == PRIMARY_REF_NONE) return NULL;
  ------------------
  |  |   66|   208k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (1216:7): [True: 43.4k, False: 164k]
  ------------------
 1217|   164k|  const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1);
 1218|   164k|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|   164k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1218:10): [True: 146k, False: 17.8k]
  ------------------
 1219|   208k|}
decodeframe.c:get_ref_scale_factors_const:
 1202|  7.14M|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1203|  7.14M|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1204|  7.14M|  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
  ------------------
  |  |   15|  7.14M|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1204:10): [True: 7.14M, False: 18.4E]
  ------------------
 1205|  7.14M|}
decodeframe.c:av1_init_above_context:
 1277|   162k|                                          MACROBLOCKD *xd) {
 1278|   613k|  for (int i = 0; i < num_planes; ++i) {
  ------------------
  |  Branch (1278:19): [True: 450k, False: 162k]
  ------------------
 1279|   450k|    xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row];
 1280|   450k|  }
 1281|   162k|  xd->above_partition_context = above_contexts->partition[tile_row];
 1282|   162k|  xd->above_txfm_context = above_contexts->txfm[tile_row];
 1283|   162k|}
decodeframe.c:av1_zero_above_context:
 1595|   162k|                                          const int tile_row) {
 1596|   162k|  const SequenceHeader *const seq_params = cm->seq_params;
 1597|   162k|  const int num_planes = av1_num_planes(cm);
 1598|   162k|  const int width = mi_col_end - mi_col_start;
 1599|   162k|  const int aligned_width =
 1600|   162k|      ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
  ------------------
  |  |   69|   162k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1601|   162k|  const int offset_y = mi_col_start;
 1602|   162k|  const int width_y = aligned_width;
 1603|   162k|  const int offset_uv = offset_y >> seq_params->subsampling_x;
 1604|   162k|  const int width_uv = width_y >> seq_params->subsampling_x;
 1605|   162k|  CommonContexts *const above_contexts = &cm->above_contexts;
 1606|       |
 1607|   162k|  av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y);
  ------------------
  |  |   44|   162k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
 1608|   162k|  if (num_planes > 1) {
  ------------------
  |  Branch (1608:7): [True: 144k, False: 18.3k]
  ------------------
 1609|   144k|    if (above_contexts->entropy[1][tile_row] &&
  ------------------
  |  Branch (1609:9): [True: 144k, False: 18.4E]
  ------------------
 1610|   144k|        above_contexts->entropy[2][tile_row]) {
  ------------------
  |  Branch (1610:9): [True: 144k, False: 18.4E]
  ------------------
 1611|   144k|      av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv,
  ------------------
  |  |   44|   144k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
 1612|   144k|                     width_uv);
 1613|   144k|      av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv,
  ------------------
  |  |   44|   144k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
 1614|   144k|                     width_uv);
 1615|  18.4E|    } else {
 1616|  18.4E|      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
 1617|  18.4E|                         "Invalid value of planes");
 1618|  18.4E|    }
 1619|   144k|  }
 1620|       |
 1621|   162k|  av1_zero_array(above_contexts->partition[tile_row] + mi_col_start,
  ------------------
  |  |   44|   162k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
 1622|   162k|                 aligned_width);
 1623|       |
 1624|   162k|  memset(above_contexts->txfm[tile_row] + mi_col_start,
 1625|   162k|         tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT));
 1626|   162k|}
decodeframe.c:av1_zero_left_context:
 1628|   293k|static inline void av1_zero_left_context(MACROBLOCKD *const xd) {
 1629|   293k|  av1_zero(xd->left_entropy_context);
  ------------------
  |  |   43|   293k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1630|   293k|  av1_zero(xd->left_partition_context);
  ------------------
  |  |   43|   293k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1631|       |
 1632|   293k|  memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
 1633|   293k|         sizeof(xd->left_txfm_context_buffer));
 1634|   293k|}
decodeframe.c:set_mi_offsets:
 1672|  15.6M|                                  int mi_col) {
 1673|       |  // 'mi_grid_base' should point to appropriate memory in 'mi'.
 1674|  15.6M|  const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
 1675|  15.6M|  const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
 1676|  15.6M|  mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx];
 1677|       |  // 'xd->mi' should point to an offset in 'mi_grid_base';
 1678|  15.6M|  xd->mi = mi_params->mi_grid_base + mi_grid_idx;
 1679|       |  // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'.
 1680|  15.6M|  xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx;
 1681|  15.6M|  xd->tx_type_map_stride = mi_params->mi_stride;
 1682|  15.6M|}
decodeframe.c:get_mi_grid_idx:
 1656|  15.6M|                                  int mi_row, int mi_col) {
 1657|  15.6M|  return mi_row * mi_params->mi_stride + mi_col;
 1658|  15.6M|}
decodeframe.c:get_alloc_mi_idx:
 1661|  15.6M|                                   int mi_row, int mi_col) {
 1662|  15.6M|  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
 1663|  15.6M|  const int mi_alloc_row = mi_row / mi_alloc_size_1d;
 1664|  15.6M|  const int mi_alloc_col = mi_col / mi_alloc_size_1d;
 1665|       |
 1666|  15.6M|  return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col;
 1667|  15.6M|}
decodeframe.c:set_plane_n4:
 1345|  22.7M|                                const int num_planes) {
 1346|  22.7M|  int i;
 1347|  89.6M|  for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (1347:15): [True: 66.8M, False: 22.7M]
  ------------------
 1348|  66.8M|    xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x;
  ------------------
  |  |   40|  66.8M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  66.8M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1349|  66.8M|    xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y;
  ------------------
  |  |   40|  66.8M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  66.8M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1350|       |
 1351|  66.8M|    xd->plane[i].width = AOMMAX(xd->plane[i].width, 4);
  ------------------
  |  |   35|  66.8M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 49.7M, False: 17.0M]
  |  |  ------------------
  ------------------
 1352|  66.8M|    xd->plane[i].height = AOMMAX(xd->plane[i].height, 4);
  ------------------
  |  |   35|  66.8M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 44.1M, False: 22.7M]
  |  |  ------------------
  ------------------
 1353|  66.8M|  }
 1354|  22.7M|}
decodeframe.c:set_entropy_context:
 1318|  15.6M|                                       const int num_planes) {
 1319|  15.6M|  int i;
 1320|  15.6M|  int row_offset = mi_row;
 1321|  15.6M|  int col_offset = mi_col;
 1322|  61.2M|  for (i = 0; i < num_planes; ++i) {
  ------------------
  |  Branch (1322:15): [True: 45.6M, False: 15.6M]
  ------------------
 1323|  45.6M|    struct macroblockd_plane *const pd = &xd->plane[i];
 1324|       |    // Offset the buffer pointer
 1325|  45.6M|    const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 1326|  45.6M|    if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
  ------------------
  |  Branch (1326:9): [True: 19.1M, False: 26.4M]
  |  Branch (1326:30): [True: 1.53M, False: 17.6M]
  |  Branch (1326:49): [True: 1.53M, False: 0]
  ------------------
 1327|  1.53M|      row_offset = mi_row - 1;
 1328|  45.6M|    if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
  ------------------
  |  Branch (1328:9): [True: 19.2M, False: 26.3M]
  |  Branch (1328:30): [True: 1.29M, False: 17.9M]
  |  Branch (1328:49): [True: 1.29M, False: 0]
  ------------------
 1329|  1.29M|      col_offset = mi_col - 1;
 1330|  45.6M|    int above_idx = col_offset;
 1331|  45.6M|    int left_idx = row_offset & MAX_MIB_MASK;
  ------------------
  |  |   50|  45.6M|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|  45.6M|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  45.6M|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|  45.6M|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  45.6M|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1332|  45.6M|    pd->above_entropy_context =
 1333|  45.6M|        &xd->above_entropy_context[i][above_idx >> pd->subsampling_x];
 1334|  45.6M|    pd->left_entropy_context =
 1335|  45.6M|        &xd->left_entropy_context[i][left_idx >> pd->subsampling_y];
 1336|  45.6M|  }
 1337|  15.6M|}
decodeframe.c:set_mi_row_col:
 1358|  22.8M|                                  int mi_rows, int mi_cols) {
 1359|  22.8M|  xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
  ------------------
  |  |   29|  22.8M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
 1360|  22.8M|  xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE);
  ------------------
  |  |   29|  22.8M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
 1361|  22.8M|  xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE));
  ------------------
  |  |   29|  22.8M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
 1362|  22.8M|  xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE);
  ------------------
  |  |   29|  22.8M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
 1363|       |
 1364|  22.8M|  xd->mi_row = mi_row;
 1365|  22.8M|  xd->mi_col = mi_col;
 1366|       |
 1367|       |  // Are edges available for intra prediction?
 1368|  22.8M|  xd->up_available = (mi_row > tile->mi_row_start);
 1369|       |
 1370|  22.8M|  const int ss_x = xd->plane[1].subsampling_x;
 1371|  22.8M|  const int ss_y = xd->plane[1].subsampling_y;
 1372|       |
 1373|  22.8M|  xd->left_available = (mi_col > tile->mi_col_start);
 1374|  22.8M|  xd->chroma_up_available = xd->up_available;
 1375|  22.8M|  xd->chroma_left_available = xd->left_available;
 1376|  22.8M|  if (ss_x && bw < mi_size_wide[BLOCK_8X8])
  ------------------
  |  Branch (1376:7): [True: 17.3M, False: 5.47M]
  |  Branch (1376:15): [True: 2.29M, False: 15.0M]
  ------------------
 1377|  2.29M|    xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
 1378|  22.8M|  if (ss_y && bh < mi_size_high[BLOCK_8X8])
  ------------------
  |  Branch (1378:7): [True: 17.3M, False: 5.49M]
  |  Branch (1378:15): [True: 2.71M, False: 14.5M]
  ------------------
 1379|  2.71M|    xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
 1380|  22.8M|  if (xd->up_available) {
  ------------------
  |  Branch (1380:7): [True: 21.9M, False: 886k]
  ------------------
 1381|  21.9M|    xd->above_mbmi = xd->mi[-xd->mi_stride];
 1382|  21.9M|  } else {
 1383|   886k|    xd->above_mbmi = NULL;
 1384|   886k|  }
 1385|       |
 1386|  22.8M|  if (xd->left_available) {
  ------------------
  |  Branch (1386:7): [True: 22.1M, False: 678k]
  ------------------
 1387|  22.1M|    xd->left_mbmi = xd->mi[-1];
 1388|  22.1M|  } else {
 1389|   678k|    xd->left_mbmi = NULL;
 1390|   678k|  }
 1391|       |
 1392|  22.8M|  const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
  ------------------
  |  Branch (1392:27): [True: 1.92M, False: 20.8M]
  |  Branch (1392:46): [True: 18.9M, False: 1.93M]
  |  Branch (1392:62): [True: 578k, False: 1.35M]
  ------------------
 1393|  22.8M|                         ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
  ------------------
  |  Branch (1393:27): [True: 1.37M, False: 20.0M]
  |  Branch (1393:46): [True: 18.6M, False: 1.38M]
  |  Branch (1393:62): [True: 407k, False: 975k]
  ------------------
 1394|  22.8M|  xd->is_chroma_ref = chroma_ref;
 1395|  22.8M|  if (chroma_ref) {
  ------------------
  |  Branch (1395:7): [True: 20.4M, False: 2.32M]
  ------------------
 1396|       |    // To help calculate the "above" and "left" chroma blocks, note that the
 1397|       |    // current block may cover multiple luma blocks (e.g., if partitioned into
 1398|       |    // 4x4 luma blocks).
 1399|       |    // First, find the top-left-most luma block covered by this chroma block
 1400|  20.4M|    MB_MODE_INFO **base_mi =
 1401|  20.4M|        &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
 1402|       |
 1403|       |    // Then, we consider the luma region covered by the left or above 4x4 chroma
 1404|       |    // prediction. We want to point to the chroma reference block in that
 1405|       |    // region, which is the bottom-right-most mi unit.
 1406|       |    // This leads to the following offsets:
 1407|  20.4M|    MB_MODE_INFO *chroma_above_mi =
 1408|  20.4M|        xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
  ------------------
  |  Branch (1408:9): [True: 19.6M, False: 857k]
  ------------------
 1409|  20.4M|    xd->chroma_above_mbmi = chroma_above_mi;
 1410|       |
 1411|  20.4M|    MB_MODE_INFO *chroma_left_mi =
 1412|  20.4M|        xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
  ------------------
  |  Branch (1412:9): [True: 19.8M, False: 673k]
  ------------------
 1413|  20.4M|    xd->chroma_left_mbmi = chroma_left_mi;
 1414|  20.4M|  }
 1415|       |
 1416|  22.8M|  xd->height = bh;
 1417|  22.8M|  xd->width = bw;
 1418|       |
 1419|  22.8M|  xd->is_last_vertical_rect = 0;
 1420|  22.8M|  if (xd->width < xd->height) {
  ------------------
  |  Branch (1420:7): [True: 5.18M, False: 17.6M]
  ------------------
 1421|  5.18M|    if (!((mi_col + xd->width) & (xd->height - 1))) {
  ------------------
  |  Branch (1421:9): [True: 1.88M, False: 3.29M]
  ------------------
 1422|  1.88M|      xd->is_last_vertical_rect = 1;
 1423|  1.88M|    }
 1424|  5.18M|  }
 1425|       |
 1426|  22.8M|  xd->is_first_horizontal_rect = 0;
 1427|  22.8M|  if (xd->width > xd->height)
  ------------------
  |  Branch (1427:7): [True: 8.60M, False: 14.1M]
  ------------------
 1428|  8.60M|    if (!(mi_row & (xd->width - 1))) xd->is_first_horizontal_rect = 1;
  ------------------
  |  Branch (1428:9): [True: 3.25M, False: 5.35M]
  ------------------
 1429|  22.8M|}
decodeframe.c:max_block_high:
 1580|  41.3M|                                 int plane) {
 1581|  41.3M|  int max_blocks_high = block_size_high[bsize];
 1582|       |
 1583|  41.3M|  if (xd->mb_to_bottom_edge < 0) {
  ------------------
  |  Branch (1583:7): [True: 311k, False: 41.0M]
  ------------------
 1584|   311k|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1585|   311k|    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
 1586|   311k|  }
 1587|       |
 1588|       |  // Scale the height in the transform block unit.
 1589|  41.3M|  return max_blocks_high >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  41.3M|#define MI_SIZE_LOG2 2
  ------------------
 1590|  41.3M|}
decodeframe.c:max_block_wide:
 1566|  41.3M|                                 int plane) {
 1567|  41.3M|  assert(bsize < BLOCK_SIZES_ALL);
 1568|  41.3M|  int max_blocks_wide = block_size_wide[bsize];
 1569|       |
 1570|  41.3M|  if (xd->mb_to_right_edge < 0) {
  ------------------
  |  Branch (1570:7): [True: 289k, False: 41.0M]
  ------------------
 1571|   289k|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1572|   289k|    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
 1573|   289k|  }
 1574|       |
 1575|       |  // Scale the width in the transform block unit.
 1576|  41.3M|  return max_blocks_wide >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  41.3M|#define MI_SIZE_LOG2 2
  ------------------
 1577|  41.3M|}
decodeframe.c:txfm_partition_update:
 1686|  1.43M|                                         TX_SIZE tx_size, TX_SIZE txb_size) {
 1687|  1.43M|  BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
 1688|  1.43M|  int bh = mi_size_high[bsize];
 1689|  1.43M|  int bw = mi_size_wide[bsize];
 1690|  1.43M|  uint8_t txw = tx_size_wide[tx_size];
 1691|  1.43M|  uint8_t txh = tx_size_high[tx_size];
 1692|  1.43M|  int i;
 1693|  6.08M|  for (i = 0; i < bh; ++i) left_ctx[i] = txh;
  ------------------
  |  Branch (1693:15): [True: 4.65M, False: 1.43M]
  ------------------
 1694|  6.44M|  for (i = 0; i < bw; ++i) above_ctx[i] = txw;
  ------------------
  |  Branch (1694:15): [True: 5.01M, False: 1.43M]
  ------------------
 1695|  1.43M|}
decodeframe.c:txfm_partition_context:
 1749|  1.49M|                                         BLOCK_SIZE bsize, TX_SIZE tx_size) {
 1750|  1.49M|  const uint8_t txw = tx_size_wide[tx_size];
 1751|  1.49M|  const uint8_t txh = tx_size_high[tx_size];
 1752|  1.49M|  const int above = *above_ctx < txw;
 1753|  1.49M|  const int left = *left_ctx < txh;
 1754|  1.49M|  int category = TXFM_PARTITION_CONTEXTS;
  ------------------
  |  |  521|  1.49M|#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3)
  ------------------
 1755|       |
 1756|       |  // dummy return, not used by others.
 1757|  1.49M|  if (tx_size <= TX_4X4) return 0;
  ------------------
  |  Branch (1757:7): [True: 0, False: 1.49M]
  ------------------
 1758|       |
 1759|  1.49M|  TX_SIZE max_tx_size =
 1760|  1.49M|      get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
  ------------------
  |  |   35|  1.49M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 496k, False: 1.00M]
  |  |  ------------------
  ------------------
 1761|       |
 1762|  1.49M|  if (max_tx_size >= TX_8X8) {
  ------------------
  |  Branch (1762:7): [True: 1.49M, False: 420]
  ------------------
 1763|  1.49M|    category =
 1764|  1.49M|        (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) +
  ------------------
  |  Branch (1764:10): [True: 513k, False: 983k]
  |  Branch (1764:55): [True: 513k, False: 18.4E]
  ------------------
 1765|  1.49M|        (TX_SIZES - 1 - max_tx_size) * 2;
 1766|  1.49M|  }
 1767|  1.49M|  assert(category != TXFM_PARTITION_CONTEXTS);
 1768|  1.49M|  return category * 3 + above + left;
 1769|  1.49M|}
decodeframe.c:get_sqr_tx_size:
 1697|  1.49M|static inline TX_SIZE get_sqr_tx_size(int tx_dim) {
 1698|  1.49M|  switch (tx_dim) {
 1699|  52.3k|    case 128:
  ------------------
  |  Branch (1699:5): [True: 52.3k, False: 1.44M]
  ------------------
 1700|   112k|    case 64: return TX_64X64; break;
  ------------------
  |  Branch (1700:5): [True: 60.4k, False: 1.43M]
  ------------------
 1701|   386k|    case 32: return TX_32X32; break;
  ------------------
  |  Branch (1701:5): [True: 386k, False: 1.11M]
  ------------------
 1702|   670k|    case 16: return TX_16X16; break;
  ------------------
  |  Branch (1702:5): [True: 670k, False: 827k]
  ------------------
 1703|   328k|    case 8: return TX_8X8; break;
  ------------------
  |  Branch (1703:5): [True: 328k, False: 1.16M]
  ------------------
 1704|      0|    default: return TX_4X4;
  ------------------
  |  Branch (1704:5): [True: 0, False: 1.49M]
  ------------------
 1705|  1.49M|  }
 1706|  1.49M|}
decodeframe.c:set_txfm_ctxs:
 1642|  14.6M|                                 const MACROBLOCKD *xd) {
 1643|  14.6M|  uint8_t bw = tx_size_wide[tx_size];
 1644|  14.6M|  uint8_t bh = tx_size_high[tx_size];
 1645|       |
 1646|  14.6M|  if (skip) {
  ------------------
  |  Branch (1646:7): [True: 1.20M, False: 13.4M]
  ------------------
 1647|  1.20M|    bw = n4_w * MI_SIZE;
  ------------------
  |  |   40|  1.20M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.20M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1648|  1.20M|    bh = n4_h * MI_SIZE;
  ------------------
  |  |   40|  1.20M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.20M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1649|  1.20M|  }
 1650|       |
 1651|  14.6M|  set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
 1652|  14.6M|  set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
 1653|  14.6M|}
decodeframe.c:set_txfm_ctx:
 1636|  29.2M|static inline void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
 1637|  29.2M|  int i;
 1638|   174M|  for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
  ------------------
  |  Branch (1638:15): [True: 145M, False: 29.2M]
  ------------------
 1639|  29.2M|}
decodeframe.c:partition_plane_context:
 1540|  10.5M|                                          int mi_col, BLOCK_SIZE bsize) {
 1541|  10.5M|  const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col;
 1542|  10.5M|  const PARTITION_CONTEXT *left_ctx =
 1543|  10.5M|      xd->left_partition_context + (mi_row & MAX_MIB_MASK);
  ------------------
  |  |   50|  10.5M|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|  10.5M|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  10.5M|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|  10.5M|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  10.5M|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1544|       |  // Minimum partition point is 8x8. Offset the bsl accordingly.
 1545|  10.5M|  const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
 1546|  10.5M|  int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
 1547|       |
 1548|  10.5M|  assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
 1549|  10.5M|  assert(bsl >= 0);
 1550|       |
 1551|  10.5M|  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
  ------------------
  |  |  169|  10.5M|#define PARTITION_PLOFFSET 4  // number of probability models per block size
  ------------------
 1552|  10.5M|}
decodeframe.c:partition_cdf_length:
 1556|  10.1M|static inline int partition_cdf_length(BLOCK_SIZE bsize) {
 1557|  10.1M|  if (bsize <= BLOCK_8X8)
  ------------------
  |  Branch (1557:7): [True: 2.10M, False: 8.05M]
  ------------------
 1558|  2.10M|    return PARTITION_TYPES;
 1559|  8.05M|  else if (bsize == BLOCK_128X128)
  ------------------
  |  Branch (1559:12): [True: 512k, False: 7.53M]
  ------------------
 1560|   512k|    return EXT_PARTITION_TYPES - 2;
 1561|  7.53M|  else
 1562|  7.53M|    return EXT_PARTITION_TYPES;
 1563|  10.1M|}
decodeframe.c:partition_gather_vert_alike:
 1487|   119k|                                               BLOCK_SIZE bsize) {
 1488|   119k|  (void)bsize;
 1489|   119k|  out[0] = CDF_PROB_TOP;
  ------------------
  |  |   33|   119k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|   119k|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
 1490|   119k|  out[0] -= cdf_element_prob(in, PARTITION_VERT);
 1491|   119k|  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
 1492|   119k|  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
 1493|   119k|  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
 1494|   119k|  out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
 1495|   119k|  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
  ------------------
  |  Branch (1495:7): [True: 69.3k, False: 49.6k]
  ------------------
 1496|   119k|  out[0] = AOM_ICDF(out[0]);
  ------------------
  |  |   38|   119k|#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
  |  |  ------------------
  |  |  |  |   33|   119k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|   119k|#define CDF_PROB_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1497|   119k|  out[1] = AOM_ICDF(CDF_PROB_TOP);
  ------------------
  |  |   38|   119k|#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
  |  |  ------------------
  |  |  |  |   33|   119k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|   119k|#define CDF_PROB_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1498|   119k|}
decodeframe.c:cdf_element_prob:
 1465|  1.81M|                                            size_t element) {
 1466|  1.81M|  assert(cdf != NULL);
 1467|  18.4E|  return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element];
  ------------------
  |  |   33|  18.4E|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|  18.4E|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
  |  Branch (1467:11): [True: 1.81M, False: 18.4E]
  ------------------
 1468|  1.81M|}
decodeframe.c:partition_gather_horz_alike:
 1472|   203k|                                               BLOCK_SIZE bsize) {
 1473|   203k|  (void)bsize;
 1474|   203k|  out[0] = CDF_PROB_TOP;
  ------------------
  |  |   33|   203k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  ------------------
  |  |  |  |   32|   203k|#define CDF_PROB_BITS 15
  |  |  ------------------
  ------------------
 1475|   203k|  out[0] -= cdf_element_prob(in, PARTITION_HORZ);
 1476|   203k|  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
 1477|   203k|  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
 1478|   203k|  out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
 1479|   203k|  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
 1480|   203k|  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
  ------------------
  |  Branch (1480:7): [True: 133k, False: 69.6k]
  ------------------
 1481|   203k|  out[0] = AOM_ICDF(out[0]);
  ------------------
  |  |   38|   203k|#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
  |  |  ------------------
  |  |  |  |   33|   203k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|   203k|#define CDF_PROB_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1482|   203k|  out[1] = AOM_ICDF(CDF_PROB_TOP);
  ------------------
  |  |   38|   203k|#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
  |  |  ------------------
  |  |  |  |   33|   203k|#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|   203k|#define CDF_PROB_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1483|   203k|}
decodeframe.c:get_partition:
 1775|  5.28M|                                           BLOCK_SIZE bsize) {
 1776|  5.28M|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
 1777|  5.28M|  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols)
  ------------------
  |  Branch (1777:7): [True: 9, False: 5.28M]
  |  Branch (1777:39): [True: 0, False: 5.28M]
  ------------------
 1778|      0|    return PARTITION_INVALID;
 1779|       |
 1780|  5.28M|  const int offset = mi_row * mi_params->mi_stride + mi_col;
 1781|  5.28M|  MB_MODE_INFO **mi = mi_params->mi_grid_base + offset;
 1782|  5.28M|  const BLOCK_SIZE subsize = mi[0]->bsize;
 1783|       |
 1784|  5.28M|  assert(bsize < BLOCK_SIZES_ALL);
 1785|       |
 1786|  5.28M|  if (subsize == bsize) return PARTITION_NONE;
  ------------------
  |  Branch (1786:7): [True: 2.19M, False: 3.09M]
  ------------------
 1787|       |
 1788|  3.09M|  const int bhigh = mi_size_high[bsize];
 1789|  3.09M|  const int bwide = mi_size_wide[bsize];
 1790|  3.09M|  const int sshigh = mi_size_high[subsize];
 1791|  3.09M|  const int sswide = mi_size_wide[subsize];
 1792|       |
 1793|  3.09M|  if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < mi_params->mi_rows &&
  ------------------
  |  Branch (1793:7): [True: 2.70M, False: 384k]
  |  Branch (1793:28): [True: 2.62M, False: 83.6k]
  ------------------
 1794|  3.09M|      mi_col + bhigh / 2 < mi_params->mi_cols) {
  ------------------
  |  Branch (1794:7): [True: 2.56M, False: 57.6k]
  ------------------
 1795|       |    // In this case, the block might be using an extended partition
 1796|       |    // type.
 1797|  2.56M|    const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
 1798|  2.56M|    const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride];
 1799|       |
 1800|  2.56M|    if (sswide == bwide) {
  ------------------
  |  Branch (1800:9): [True: 864k, False: 1.70M]
  ------------------
 1801|       |      // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
 1802|       |      // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
 1803|       |      // half was split.
 1804|   864k|      if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
  ------------------
  |  Branch (1804:11): [True: 408k, False: 456k]
  ------------------
 1805|   456k|      assert(sshigh * 2 == bhigh);
 1806|       |
 1807|   456k|      if (mbmi_below->bsize == subsize)
  ------------------
  |  Branch (1807:11): [True: 346k, False: 110k]
  ------------------
 1808|   346k|        return PARTITION_HORZ;
 1809|   110k|      else
 1810|   110k|        return PARTITION_HORZ_B;
 1811|  1.70M|    } else if (sshigh == bhigh) {
  ------------------
  |  Branch (1811:16): [True: 391k, False: 1.30M]
  ------------------
 1812|       |      // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
 1813|       |      // PARTITION_VERT_B. To distinguish the latter two, check if the right
 1814|       |      // half was split.
 1815|   391k|      if (sswide * 4 == bwide) return PARTITION_VERT_4;
  ------------------
  |  Branch (1815:11): [True: 134k, False: 257k]
  ------------------
 1816|   257k|      assert(sswide * 2 == bwide);
 1817|       |
 1818|   257k|      if (mbmi_right->bsize == subsize)
  ------------------
  |  Branch (1818:11): [True: 178k, False: 78.6k]
  ------------------
 1819|   178k|        return PARTITION_VERT;
 1820|  78.6k|      else
 1821|  78.6k|        return PARTITION_VERT_B;
 1822|  1.30M|    } else {
 1823|       |      // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
 1824|       |      // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
 1825|       |      // dimensions, we immediately know this is a split (which will recurse to
 1826|       |      // get to subsize). Otherwise look down and to the right. With
 1827|       |      // PARTITION_VERT_A, the right block will have height bhigh; with
 1828|       |      // PARTITION_HORZ_A, the lower block with have width bwide. Otherwise
 1829|       |      // it's PARTITION_SPLIT.
 1830|  1.30M|      if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
  ------------------
  |  Branch (1830:11): [True: 421k, False: 887k]
  |  Branch (1830:34): [True: 254k, False: 633k]
  ------------------
 1831|       |
 1832|   632k|      if (mi_size_wide[mbmi_below->bsize] == bwide) return PARTITION_HORZ_A;
  ------------------
  |  Branch (1832:11): [True: 121k, False: 511k]
  ------------------
 1833|   511k|      if (mi_size_high[mbmi_right->bsize] == bhigh) return PARTITION_VERT_A;
  ------------------
  |  Branch (1833:11): [True: 64.8k, False: 446k]
  ------------------
 1834|       |
 1835|   446k|      return PARTITION_SPLIT;
 1836|   511k|    }
 1837|  2.56M|  }
 1838|   525k|  const int vert_split = sswide < bwide;
 1839|   525k|  const int horz_split = sshigh < bhigh;
 1840|   525k|  const int split_idx = (vert_split << 1) | horz_split;
 1841|   525k|  assert(split_idx != 0);
 1842|       |
 1843|   526k|  static const PARTITION_TYPE base_partitions[4] = {
 1844|   526k|    PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
 1845|   526k|  };
 1846|       |
 1847|   526k|  return base_partitions[split_idx];
 1848|   525k|}
decodeframe.c:update_ext_partition_context:
 1503|  11.3M|                                                PARTITION_TYPE partition) {
 1504|  11.3M|  if (bsize >= BLOCK_8X8) {
  ------------------
  |  Branch (1504:7): [True: 10.5M, False: 750k]
  ------------------
 1505|  10.5M|    const int hbs = mi_size_wide[bsize] / 2;
 1506|  10.5M|    BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
 1507|  10.5M|    switch (partition) {
 1508|  2.67M|      case PARTITION_SPLIT:
  ------------------
  |  Branch (1508:7): [True: 2.67M, False: 7.87M]
  ------------------
 1509|  2.67M|        if (bsize != BLOCK_8X8) break;
  ------------------
  |  Branch (1509:13): [True: 2.48M, False: 187k]
  ------------------
 1510|   187k|        AOM_FALLTHROUGH_INTENDED;
  ------------------
  |  |   52|   187k|  do {                           \
  |  |   53|   187k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (53:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1511|  3.97M|      case PARTITION_NONE:
  ------------------
  |  Branch (1511:7): [True: 3.79M, False: 6.75M]
  ------------------
 1512|  5.26M|      case PARTITION_HORZ:
  ------------------
  |  Branch (1512:7): [True: 1.28M, False: 9.26M]
  ------------------
 1513|  6.12M|      case PARTITION_VERT:
  ------------------
  |  Branch (1513:7): [True: 861k, False: 9.68M]
  ------------------
 1514|  6.80M|      case PARTITION_HORZ_4:
  ------------------
  |  Branch (1514:7): [True: 679k, False: 9.86M]
  ------------------
 1515|  7.27M|      case PARTITION_VERT_4:
  ------------------
  |  Branch (1515:7): [True: 465k, False: 10.0M]
  ------------------
 1516|  7.27M|        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 1517|  7.27M|        break;
 1518|   242k|      case PARTITION_HORZ_A:
  ------------------
  |  Branch (1518:7): [True: 242k, False: 10.3M]
  ------------------
 1519|   242k|        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
 1520|   242k|        update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
 1521|   242k|        break;
 1522|   215k|      case PARTITION_HORZ_B:
  ------------------
  |  Branch (1522:7): [True: 215k, False: 10.3M]
  ------------------
 1523|   215k|        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
 1524|   215k|        update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
 1525|   215k|        break;
 1526|   155k|      case PARTITION_VERT_A:
  ------------------
  |  Branch (1526:7): [True: 155k, False: 10.3M]
  ------------------
 1527|   155k|        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
 1528|   155k|        update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
 1529|   155k|        break;
 1530|   175k|      case PARTITION_VERT_B:
  ------------------
  |  Branch (1530:7): [True: 175k, False: 10.3M]
  ------------------
 1531|   175k|        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
 1532|   175k|        update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
 1533|   175k|        break;
 1534|      0|      default: assert(0 && "Invalid partition type");
  ------------------
  |  Branch (1534:7): [True: 0, False: 10.5M]
  ------------------
 1535|  10.5M|    }
 1536|  10.5M|  }
 1537|  11.3M|}
decodeframe.c:update_partition_context:
 1443|  8.85M|                                            BLOCK_SIZE bsize) {
 1444|  8.85M|  PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col;
 1445|  8.85M|  PARTITION_CONTEXT *const left_ctx =
 1446|  8.85M|      xd->left_partition_context + (mi_row & MAX_MIB_MASK);
  ------------------
  |  |   50|  8.85M|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|  8.85M|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  8.85M|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|  8.85M|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  8.85M|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1447|       |
 1448|  8.85M|  const int bw = mi_size_wide[bsize];
 1449|  8.85M|  const int bh = mi_size_high[bsize];
 1450|  8.85M|  memset(above_ctx, partition_context_lookup[subsize].above, bw);
 1451|  8.85M|  memset(left_ctx, partition_context_lookup[subsize].left, bh);
 1452|  8.85M|}
decodeframe.c:av1_init_macroblockd:
 1285|   288k|static inline void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) {
 1286|   288k|  const int num_planes = av1_num_planes(cm);
 1287|   288k|  const CommonQuantParams *const quant_params = &cm->quant_params;
 1288|       |
 1289|  1.08M|  for (int i = 0; i < num_planes; ++i) {
  ------------------
  |  Branch (1289:19): [True: 794k, False: 288k]
  ------------------
 1290|   794k|    if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
  ------------------
  |  Branch (1290:9): [True: 288k, False: 506k]
  ------------------
 1291|   288k|      memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX,
 1292|   288k|             sizeof(quant_params->y_dequant_QTX));
 1293|   288k|      memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix,
 1294|   288k|             sizeof(quant_params->y_iqmatrix));
 1295|       |
 1296|   506k|    } else {
 1297|   506k|      if (i == AOM_PLANE_U) {
  ------------------
  |  |  227|   506k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  |  Branch (1297:11): [True: 254k, False: 251k]
  ------------------
 1298|   254k|        memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX,
 1299|   254k|               sizeof(quant_params->u_dequant_QTX));
 1300|   254k|        memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix,
 1301|   254k|               sizeof(quant_params->u_iqmatrix));
 1302|   254k|      } else {
 1303|   251k|        memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX,
 1304|   251k|               sizeof(quant_params->v_dequant_QTX));
 1305|   251k|        memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix,
 1306|   251k|               sizeof(quant_params->v_iqmatrix));
 1307|   251k|      }
 1308|   506k|    }
 1309|   794k|  }
 1310|   288k|  xd->mi_stride = cm->mi_params.mi_stride;
 1311|   288k|  xd->error_info = cm->error;
 1312|   288k|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 1313|   288k|  cfl_init(&xd->cfl, cm->seq_params);
 1314|   288k|#endif
 1315|   288k|}
decodemv.c:frame_is_intra_only:
 1174|  24.8M|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|  24.8M|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 18.3M, False: 6.52M]
  ------------------
 1176|  24.8M|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 162k, False: 6.36M]
  ------------------
 1177|  24.8M|}
decodemv.c:get_mi_grid_idx:
 1656|   909k|                                  int mi_row, int mi_col) {
 1657|   909k|  return mi_row * mi_params->mi_stride + mi_col;
 1658|   909k|}
decodemv.c:av1_num_planes:
 1271|  2.66M|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  2.66M|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  2.60M|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 58.3k, False: 2.60M]
  ------------------
 1273|  2.66M|}
decodemv.c:get_y_mode_cdf:
 1433|  9.19M|                                           const MB_MODE_INFO *left_mi) {
 1434|  9.19M|  const PREDICTION_MODE above = av1_above_block_mode(above_mi);
 1435|  9.19M|  const PREDICTION_MODE left = av1_left_block_mode(left_mi);
 1436|  9.19M|  const int above_ctx = intra_mode_context[above];
 1437|  9.19M|  const int left_ctx = intra_mode_context[left];
 1438|  9.19M|  return tile_ctx->kf_y_cdf[above_ctx][left_ctx];
 1439|  9.19M|}
decodemv.c:get_ref_scale_factors_const:
 1202|  5.18M|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1203|  5.18M|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1204|  5.18M|  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
  ------------------
  |  |   15|  5.18M|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1204:10): [True: 5.18M, False: 18.4E]
  ------------------
 1205|  5.18M|}
decodemv.c:get_ref_frame_map_idx:
 1187|  5.85M|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|  5.85M|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 5.85M, False: 18.4E]
  |  Branch (1188:38): [True: 5.85M, False: 18.4E]
  ------------------
 1189|  5.85M|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|  18.4E|             : INVALID_IDX;
  ------------------
  |  |   15|  18.4E|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|  5.85M|}
decodemv.c:get_ref_frame_buf:
 1194|   673k|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|   673k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|   674k|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|   673k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 674k, False: 18.4E]
  ------------------
 1197|   673k|}
decoder.c:calc_mi_size:
 1339|   367k|static inline int calc_mi_size(int len) {
 1340|       |  // len is in mi units. Align to a multiple of SBs.
 1341|   367k|  return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   69|   367k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1342|   367k|}
decoder.c:av1_num_planes:
 1271|  78.6M|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  78.6M|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  73.6M|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 5.02M, False: 73.6M]
  ------------------
 1273|  78.6M|}
decoder.c:get_ref_frame:
 1098|    357|static inline YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
 1099|    357|  if (index < 0 || index >= REF_FRAMES) return NULL;
  ------------------
  |  Branch (1099:7): [True: 133, False: 224]
  |  Branch (1099:20): [True: 0, False: 224]
  ------------------
 1100|    224|  if (cm->ref_frame_map[index] == NULL) return NULL;
  ------------------
  |  Branch (1100:7): [True: 183, False: 41]
  ------------------
 1101|     41|  return &cm->ref_frame_map[index]->buf;
 1102|    224|}
decoder.c:get_ref_frame_buf:
 1194|    362|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|    362|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|    362|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|    362|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 14, False: 348]
  ------------------
 1197|    362|}
decoder.c:get_ref_frame_map_idx:
 1187|    362|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|    362|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 362, False: 0]
  |  Branch (1188:38): [True: 362, False: 0]
  ------------------
 1189|    362|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|    362|             : INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|    362|}
decoder.c:assign_cur_frame_new_fb:
 1138|   311k|static inline RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) {
 1139|       |  // Release the previously-used frame-buffer
 1140|   311k|  if (cm->cur_frame != NULL) {
  ------------------
  |  Branch (1140:7): [True: 0, False: 311k]
  ------------------
 1141|      0|    --cm->cur_frame->ref_count;
 1142|      0|    cm->cur_frame = NULL;
 1143|      0|  }
 1144|       |
 1145|       |  // Assign a new framebuffer
 1146|   311k|  const int new_fb_idx = get_free_fb(cm);
 1147|   311k|  if (new_fb_idx == INVALID_IDX) return NULL;
  ------------------
  |  |   15|   311k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1147:7): [True: 0, False: 311k]
  ------------------
 1148|       |
 1149|   311k|  cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx];
 1150|       |#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
 1151|       |  aom_invalidate_pyramid(cm->cur_frame->buf.y_pyramid);
 1152|       |  av1_invalidate_corner_list(cm->cur_frame->buf.corners);
 1153|       |#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
 1154|   311k|  av1_zero(cm->cur_frame->interp_filter_selected);
  ------------------
  |  |   43|   311k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1155|   311k|  return cm->cur_frame;
 1156|   311k|}
decoder.c:get_free_fb:
 1104|   311k|static inline int get_free_fb(AV1_COMMON *cm) {
 1105|   311k|  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 1106|   311k|  int i;
 1107|       |
 1108|   311k|  lock_buffer_pool(cm->buffer_pool);
 1109|   311k|  const int num_frame_bufs = cm->buffer_pool->num_frame_bufs;
 1110|   570k|  for (i = 0; i < num_frame_bufs; ++i)
  ------------------
  |  Branch (1110:15): [True: 570k, False: 0]
  ------------------
 1111|   570k|    if (frame_bufs[i].ref_count == 0) break;
  ------------------
  |  Branch (1111:9): [True: 311k, False: 259k]
  ------------------
 1112|       |
 1113|   311k|  if (i != num_frame_bufs) {
  ------------------
  |  Branch (1113:7): [True: 311k, False: 0]
  ------------------
 1114|   311k|    if (frame_bufs[i].buf.use_external_reference_buffers) {
  ------------------
  |  Branch (1114:9): [True: 0, False: 311k]
  ------------------
 1115|       |      // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the
 1116|       |      // external reference buffers. Restore the buffer pointers to point to the
 1117|       |      // internally allocated memory.
 1118|      0|      YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
 1119|      0|      ybf->y_buffer = ybf->store_buf_adr[0];
 1120|      0|      ybf->u_buffer = ybf->store_buf_adr[1];
 1121|      0|      ybf->v_buffer = ybf->store_buf_adr[2];
 1122|      0|      ybf->use_external_reference_buffers = 0;
 1123|      0|    }
 1124|       |
 1125|   311k|    frame_bufs[i].ref_count = 1;
 1126|   311k|  } else {
 1127|       |    // We should never run out of free buffers. If this assertion fails, there
 1128|       |    // is a reference leak.
 1129|      0|    assert(0 && "Ran out of free frame buffers. Likely a reference leak.");
 1130|       |    // Reset i to be INVALID_IDX to indicate no free buffer found.
 1131|      0|    i = INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1132|      0|  }
 1133|       |
 1134|   311k|  unlock_buffer_pool(cm->buffer_pool);
 1135|   311k|  return i;
 1136|   311k|}
decoder.c:lock_buffer_pool:
 1082|   623k|static void lock_buffer_pool(BufferPool *const pool) {
 1083|   623k|#if CONFIG_MULTITHREAD
 1084|   623k|  pthread_mutex_lock(&pool->pool_mutex);
 1085|       |#else
 1086|       |  (void)pool;
 1087|       |#endif
 1088|   623k|}
decoder.c:unlock_buffer_pool:
 1090|   623k|static void unlock_buffer_pool(BufferPool *const pool) {
 1091|   623k|#if CONFIG_MULTITHREAD
 1092|   623k|  pthread_mutex_unlock(&pool->pool_mutex);
 1093|       |#else
 1094|       |  (void)pool;
 1095|       |#endif
 1096|   623k|}
obu.c:is_valid_seq_level_idx:
 1876|  94.3k|static inline int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
 1877|  94.3k|  return seq_level_idx == SEQ_LEVEL_MAX ||
  ------------------
  |  Branch (1877:10): [True: 2.75k, False: 91.6k]
  ------------------
 1878|  94.3k|         (seq_level_idx < SEQ_LEVELS &&
  ------------------
  |  Branch (1878:11): [True: 91.3k, False: 237]
  ------------------
 1879|       |          // The following levels are currently undefined.
 1880|  91.6k|          seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 &&
  ------------------
  |  Branch (1880:11): [True: 91.3k, False: 67]
  |  Branch (1880:45): [True: 90.4k, False: 888]
  ------------------
 1881|  91.6k|          seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 &&
  ------------------
  |  Branch (1881:11): [True: 90.3k, False: 75]
  |  Branch (1881:45): [True: 90.2k, False: 59]
  ------------------
 1882|  91.6k|          seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3
  ------------------
  |  Branch (1882:11): [True: 90.1k, False: 156]
  |  Branch (1882:45): [True: 90.1k, False: 25]
  ------------------
 1883|  91.6k|#if !CONFIG_CWG_C013
 1884|  91.6k|          && seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 &&
  ------------------
  |  Branch (1884:14): [True: 89.2k, False: 884]
  |  Branch (1884:48): [True: 89.0k, False: 198]
  ------------------
 1885|  91.6k|          seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3 &&
  ------------------
  |  Branch (1885:11): [True: 88.9k, False: 80]
  |  Branch (1885:45): [True: 88.8k, False: 74]
  ------------------
 1886|  91.6k|          seq_level_idx != SEQ_LEVEL_8_0 && seq_level_idx != SEQ_LEVEL_8_1 &&
  ------------------
  |  Branch (1886:11): [True: 88.7k, False: 101]
  |  Branch (1886:45): [True: 88.5k, False: 256]
  ------------------
 1887|  91.6k|          seq_level_idx != SEQ_LEVEL_8_2 && seq_level_idx != SEQ_LEVEL_8_3
  ------------------
  |  Branch (1887:11): [True: 88.3k, False: 116]
  |  Branch (1887:45): [True: 87.0k, False: 1.34k]
  ------------------
 1888|  91.6k|#endif
 1889|  91.6k|         );
 1890|  94.3k|}
alloccommon.c:av1_num_planes:
 1271|   121k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|   121k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|   106k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 14.5k, False: 106k]
  ------------------
 1273|   121k|}
alloccommon.c:calc_mi_size:
 1339|  56.1k|static inline int calc_mi_size(int len) {
 1340|       |  // len is in mi units. Align to a multiple of SBs.
 1341|  56.1k|  return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   69|  56.1k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1342|  56.1k|}
blockd.c:max_block_wide:
 1566|  1.27M|                                 int plane) {
 1567|  1.27M|  assert(bsize < BLOCK_SIZES_ALL);
 1568|  1.27M|  int max_blocks_wide = block_size_wide[bsize];
 1569|       |
 1570|  1.27M|  if (xd->mb_to_right_edge < 0) {
  ------------------
  |  Branch (1570:7): [True: 1.23M, False: 35.0k]
  ------------------
 1571|  1.23M|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1572|  1.23M|    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
 1573|  1.23M|  }
 1574|       |
 1575|       |  // Scale the width in the transform block unit.
 1576|  1.27M|  return max_blocks_wide >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  1.27M|#define MI_SIZE_LOG2 2
  ------------------
 1577|  1.27M|}
blockd.c:max_block_high:
 1580|   783k|                                 int plane) {
 1581|   783k|  int max_blocks_high = block_size_high[bsize];
 1582|       |
 1583|   783k|  if (xd->mb_to_bottom_edge < 0) {
  ------------------
  |  Branch (1583:7): [True: 783k, False: 0]
  ------------------
 1584|   783k|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1585|   783k|    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
 1586|   783k|  }
 1587|       |
 1588|       |  // Scale the height in the transform block unit.
 1589|   783k|  return max_blocks_high >> MI_SIZE_LOG2;
  ------------------
  |  |   39|   783k|#define MI_SIZE_LOG2 2
  ------------------
 1590|   783k|}
cdef.c:av1_num_planes:
 1271|   672k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|   672k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|   635k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 37.1k, False: 635k]
  ------------------
 1273|   672k|}
cfl.c:max_block_wide:
 1566|   745k|                                 int plane) {
 1567|   745k|  assert(bsize < BLOCK_SIZES_ALL);
 1568|   745k|  int max_blocks_wide = block_size_wide[bsize];
 1569|       |
 1570|   745k|  if (xd->mb_to_right_edge < 0) {
  ------------------
  |  Branch (1570:7): [True: 0, False: 745k]
  ------------------
 1571|      0|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1572|      0|    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
 1573|      0|  }
 1574|       |
 1575|       |  // Scale the width in the transform block unit.
 1576|   745k|  return max_blocks_wide >> MI_SIZE_LOG2;
  ------------------
  |  |   39|   745k|#define MI_SIZE_LOG2 2
  ------------------
 1577|   745k|}
cfl.c:max_block_high:
 1580|   745k|                                 int plane) {
 1581|   745k|  int max_blocks_high = block_size_high[bsize];
 1582|       |
 1583|   745k|  if (xd->mb_to_bottom_edge < 0) {
  ------------------
  |  Branch (1583:7): [True: 0, False: 745k]
  ------------------
 1584|      0|    const struct macroblockd_plane *const pd = &xd->plane[plane];
 1585|      0|    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
 1586|      0|  }
 1587|       |
 1588|       |  // Scale the height in the transform block unit.
 1589|   745k|  return max_blocks_high >> MI_SIZE_LOG2;
  ------------------
  |  |   39|   745k|#define MI_SIZE_LOG2 2
  ------------------
 1590|   745k|}
cfl.c:get_tx_size:
 1708|   745k|static inline TX_SIZE get_tx_size(int width, int height) {
 1709|   745k|  if (width == height) {
  ------------------
  |  Branch (1709:7): [True: 177k, False: 567k]
  ------------------
 1710|   177k|    return get_sqr_tx_size(width);
 1711|   177k|  }
 1712|   567k|  if (width < height) {
  ------------------
  |  Branch (1712:7): [True: 250k, False: 317k]
  ------------------
 1713|   250k|    if (width + width == height) {
  ------------------
  |  Branch (1713:9): [True: 114k, False: 135k]
  ------------------
 1714|   114k|      switch (width) {
  ------------------
  |  Branch (1714:15): [True: 0, False: 114k]
  ------------------
 1715|   114k|        case 4: return TX_4X8; break;
  ------------------
  |  Branch (1715:9): [True: 114k, False: 0]
  ------------------
 1716|      0|        case 8: return TX_8X16; break;
  ------------------
  |  Branch (1716:9): [True: 0, False: 114k]
  ------------------
 1717|      0|        case 16: return TX_16X32; break;
  ------------------
  |  Branch (1717:9): [True: 0, False: 114k]
  ------------------
 1718|      0|        case 32: return TX_32X64; break;
  ------------------
  |  Branch (1718:9): [True: 0, False: 114k]
  ------------------
 1719|   114k|      }
 1720|   135k|    } else {
 1721|   135k|      switch (width) {
  ------------------
  |  Branch (1721:15): [True: 0, False: 135k]
  ------------------
 1722|   135k|        case 4: return TX_4X16; break;
  ------------------
  |  Branch (1722:9): [True: 135k, False: 0]
  ------------------
 1723|      0|        case 8: return TX_8X32; break;
  ------------------
  |  Branch (1723:9): [True: 0, False: 135k]
  ------------------
 1724|      0|        case 16: return TX_16X64; break;
  ------------------
  |  Branch (1724:9): [True: 0, False: 135k]
  ------------------
 1725|   135k|      }
 1726|   135k|    }
 1727|   317k|  } else {
 1728|   317k|    if (height + height == width) {
  ------------------
  |  Branch (1728:9): [True: 142k, False: 175k]
  ------------------
 1729|   142k|      switch (height) {
  ------------------
  |  Branch (1729:15): [True: 0, False: 142k]
  ------------------
 1730|   142k|        case 4: return TX_8X4; break;
  ------------------
  |  Branch (1730:9): [True: 142k, False: 0]
  ------------------
 1731|      0|        case 8: return TX_16X8; break;
  ------------------
  |  Branch (1731:9): [True: 0, False: 142k]
  ------------------
 1732|      0|        case 16: return TX_32X16; break;
  ------------------
  |  Branch (1732:9): [True: 0, False: 142k]
  ------------------
 1733|      0|        case 32: return TX_64X32; break;
  ------------------
  |  Branch (1733:9): [True: 0, False: 142k]
  ------------------
 1734|   142k|      }
 1735|   175k|    } else {
 1736|   175k|      switch (height) {
  ------------------
  |  Branch (1736:15): [True: 18.4E, False: 175k]
  ------------------
 1737|   175k|        case 4: return TX_16X4; break;
  ------------------
  |  Branch (1737:9): [True: 175k, False: 18.4E]
  ------------------
 1738|      0|        case 8: return TX_32X8; break;
  ------------------
  |  Branch (1738:9): [True: 0, False: 175k]
  ------------------
 1739|      0|        case 16: return TX_64X16; break;
  ------------------
  |  Branch (1739:9): [True: 0, False: 175k]
  ------------------
 1740|   175k|      }
 1741|   175k|    }
 1742|   317k|  }
 1743|      0|  assert(0);
 1744|      0|  return TX_4X4;
 1745|      0|}
cfl.c:get_sqr_tx_size:
 1697|   177k|static inline TX_SIZE get_sqr_tx_size(int tx_dim) {
 1698|   177k|  switch (tx_dim) {
 1699|      0|    case 128:
  ------------------
  |  Branch (1699:5): [True: 0, False: 177k]
  ------------------
 1700|      0|    case 64: return TX_64X64; break;
  ------------------
  |  Branch (1700:5): [True: 0, False: 177k]
  ------------------
 1701|      0|    case 32: return TX_32X32; break;
  ------------------
  |  Branch (1701:5): [True: 0, False: 177k]
  ------------------
 1702|      0|    case 16: return TX_16X16; break;
  ------------------
  |  Branch (1702:5): [True: 0, False: 177k]
  ------------------
 1703|      0|    case 8: return TX_8X8; break;
  ------------------
  |  Branch (1703:5): [True: 0, False: 177k]
  ------------------
 1704|   177k|    default: return TX_4X4;
  ------------------
  |  Branch (1704:5): [True: 177k, False: 0]
  ------------------
 1705|   177k|  }
 1706|   177k|}
entropymode.c:get_ref_frame_buf:
 1194|   241k|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|   241k|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|   241k|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|   241k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 81.8k, False: 159k]
  ------------------
 1197|   241k|}
entropymode.c:get_ref_frame_map_idx:
 1187|   241k|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|   241k|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 241k, False: 0]
  |  Branch (1188:38): [True: 241k, False: 0]
  ------------------
 1189|   241k|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|   241k|             : INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|   241k|}
mvref_common.c:get_ref_frame_buf:
 1194|  6.60M|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|  6.60M|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|  6.60M|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|  6.60M|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 4.41M, False: 2.18M]
  ------------------
 1197|  6.60M|}
mvref_common.c:get_ref_frame_map_idx:
 1187|  6.54M|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|  6.56M|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 6.56M, False: 18.4E]
  |  Branch (1188:38): [True: 6.56M, False: 0]
  ------------------
 1189|  6.54M|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|  6.54M|             : INVALID_IDX;
  ------------------
  |  |   15|      0|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|  6.54M|}
mvref_common.c:frame_is_intra_only:
 1174|   130k|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|   130k|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 57.3k, False: 72.9k]
  ------------------
 1176|   130k|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 26.8k, False: 46.0k]
  ------------------
 1177|   130k|}
reconinter.c:get_ref_frame_buf:
 1194|  2.50M|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1195|  2.50M|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1196|  2.50M|  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
  ------------------
  |  |   15|  2.50M|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1196:10): [True: 2.50M, False: 52]
  ------------------
 1197|  2.50M|}
reconinter.c:get_ref_frame_map_idx:
 1187|  3.75M|                                        const MV_REFERENCE_FRAME ref_frame) {
 1188|  3.75M|  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
  ------------------
  |  Branch (1188:11): [True: 3.75M, False: 18.4E]
  |  Branch (1188:38): [True: 3.75M, False: 18.4E]
  ------------------
 1189|  3.75M|             ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
 1190|  18.4E|             : INVALID_IDX;
  ------------------
  |  |   15|  18.4E|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1191|  3.75M|}
reconinter.c:av1_num_planes:
 1271|  4.51M|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  4.51M|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  4.49M|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 21.7k, False: 4.49M]
  ------------------
 1273|  4.51M|}
reconinter.c:get_ref_scale_factors_const:
 1202|  1.24M|    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
 1203|  1.24M|  const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
 1204|  1.24M|  return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
  ------------------
  |  |   15|  1.24M|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1204:10): [True: 1.24M, False: 49]
  ------------------
 1205|  1.24M|}
resize.c:av1_num_planes:
 1271|  19.7k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  19.7k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  7.55k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 12.2k, False: 7.55k]
  ------------------
 1273|  19.7k|}
resize.c:lock_buffer_pool:
 1082|  9.88k|static void lock_buffer_pool(BufferPool *const pool) {
 1083|  9.88k|#if CONFIG_MULTITHREAD
 1084|  9.88k|  pthread_mutex_lock(&pool->pool_mutex);
 1085|       |#else
 1086|       |  (void)pool;
 1087|       |#endif
 1088|  9.88k|}
resize.c:unlock_buffer_pool:
 1090|  9.88k|static void unlock_buffer_pool(BufferPool *const pool) {
 1091|  9.88k|#if CONFIG_MULTITHREAD
 1092|  9.88k|  pthread_mutex_unlock(&pool->pool_mutex);
 1093|       |#else
 1094|       |  (void)pool;
 1095|       |#endif
 1096|  9.88k|}
restoration.c:av1_num_planes:
 1271|  45.2k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|  45.2k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|  36.1k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 9.13k, False: 36.1k]
  ------------------
 1273|  45.2k|}
thread_common.c:av1_num_planes:
 1271|   905k|static inline int av1_num_planes(const AV1_COMMON *cm) {
 1272|   905k|  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
  ------------------
  |  |   36|   862k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (1272:10): [True: 42.8k, False: 862k]
  ------------------
 1273|   905k|}
thread_common.c:frame_is_intra_only:
 1174|  91.5k|static inline int frame_is_intra_only(const AV1_COMMON *const cm) {
 1175|  91.5k|  return cm->current_frame.frame_type == KEY_FRAME ||
  ------------------
  |  Branch (1175:10): [True: 38.6k, False: 52.8k]
  ------------------
 1176|  91.5k|         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
  ------------------
  |  Branch (1176:10): [True: 6.80k, False: 46.0k]
  ------------------
 1177|  91.5k|}

av1_highbd_iwht4x4_1_add_c:
   82|   630k|                                int dest_stride, int bd) {
   83|   630k|  int i;
   84|   630k|  tran_low_t a1, e1;
   85|   630k|  tran_low_t tmp[4];
   86|   630k|  const tran_low_t *ip = in;
   87|   630k|  tran_low_t *op = tmp;
   88|   630k|  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  ------------------
  |  |   75|   630k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   89|   630k|  (void)bd;
   90|       |
   91|   630k|  a1 = ip[0 * 4] >> UNIT_QUANT_SHIFT;
  ------------------
  |  |   21|   630k|#define UNIT_QUANT_SHIFT 2
  ------------------
   92|   630k|  e1 = a1 >> 1;
   93|   630k|  a1 -= e1;
   94|   630k|  op[0] = a1;
   95|   630k|  op[1] = op[2] = op[3] = e1;
   96|       |
   97|   630k|  ip = tmp;
   98|  3.15M|  for (i = 0; i < 4; i++) {
  ------------------
  |  Branch (98:15): [True: 2.52M, False: 630k]
  ------------------
   99|  2.52M|    e1 = ip[0] >> 1;
  100|  2.52M|    a1 = ip[0] - e1;
  101|  2.52M|    dest[dest_stride * 0] =
  102|  2.52M|        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
  103|  2.52M|    dest[dest_stride * 1] =
  104|  2.52M|        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
  105|  2.52M|    dest[dest_stride * 2] =
  106|  2.52M|        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
  107|  2.52M|    dest[dest_stride * 3] =
  108|  2.52M|        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
  109|  2.52M|    ip++;
  110|  2.52M|    dest++;
  111|  2.52M|  }
  112|   630k|}

av1_loop_filter_init:
  110|  16.1k|void av1_loop_filter_init(AV1_COMMON *cm) {
  111|  16.1k|  assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));
  112|  16.1k|  loop_filter_info_n *lfi = &cm->lf_info;
  113|  16.1k|  struct loopfilter *lf = &cm->lf;
  114|  16.1k|  int lvl;
  115|       |
  116|       |  // init limits for given sharpness
  117|  16.1k|  update_sharpness(lfi, lf->sharpness_level);
  118|       |
  119|       |  // init hev threshold const vectors
  120|  1.04M|  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
  ------------------
  |  |   27|  1.04M|#define MAX_LOOP_FILTER 63
  ------------------
  |  Branch (120:17): [True: 1.03M, False: 16.1k]
  ------------------
  121|  1.03M|    memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
  ------------------
  |  |   30|  1.03M|#define SIMD_WIDTH 16
  ------------------
  122|  16.1k|}
av1_loop_filter_frame_init:
  128|  30.0k|                                int plane_end) {
  129|  30.0k|  int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
  130|  30.0k|  int plane;
  131|  30.0k|  int seg_id;
  132|       |  // n_shift is the multiplier for lf_deltas
  133|       |  // the multiplier is 1 for when filter_lvl is between 0 and 31;
  134|       |  // 2 when filter_lvl is between 32 and 63
  135|  30.0k|  loop_filter_info_n *const lfi = &cm->lf_info;
  136|  30.0k|  struct loopfilter *const lf = &cm->lf;
  137|  30.0k|  const struct segmentation *const seg = &cm->seg;
  138|       |
  139|       |  // update sharpness limits
  140|  30.0k|  update_sharpness(lfi, lf->sharpness_level);
  141|       |
  142|  30.0k|  filt_lvl[0] = cm->lf.filter_level[0];
  143|  30.0k|  filt_lvl[1] = cm->lf.filter_level_u;
  144|  30.0k|  filt_lvl[2] = cm->lf.filter_level_v;
  145|       |
  146|  30.0k|  filt_lvl_r[0] = cm->lf.filter_level[1];
  147|  30.0k|  filt_lvl_r[1] = cm->lf.filter_level_u;
  148|  30.0k|  filt_lvl_r[2] = cm->lf.filter_level_v;
  149|       |
  150|  30.0k|  assert(plane_start >= AOM_PLANE_Y);
  151|  30.0k|  assert(plane_end <= MAX_MB_PLANE);
  152|       |
  153|   113k|  for (plane = plane_start; plane < plane_end; plane++) {
  ------------------
  |  Branch (153:29): [True: 83.4k, False: 30.0k]
  ------------------
  154|  83.4k|    if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
  ------------------
  |  Branch (154:9): [True: 30.0k, False: 53.4k]
  |  Branch (154:23): [True: 6.32k, False: 23.6k]
  |  Branch (154:39): [True: 0, False: 6.32k]
  ------------------
  155|      0|      break;
  156|  83.4k|    else if (plane == 1 && !filt_lvl[1])
  ------------------
  |  Branch (156:14): [True: 26.7k, False: 56.7k]
  |  Branch (156:28): [True: 4.44k, False: 22.2k]
  ------------------
  157|  4.44k|      continue;
  158|  79.0k|    else if (plane == 2 && !filt_lvl[2])
  ------------------
  |  Branch (158:14): [True: 26.7k, False: 52.3k]
  |  Branch (158:28): [True: 5.98k, False: 20.7k]
  ------------------
  159|  5.98k|      continue;
  160|       |
  161|   657k|    for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
  ------------------
  |  |   21|   657k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (161:22): [True: 584k, False: 73.0k]
  ------------------
  162|  1.75M|      for (int dir = 0; dir < 2; ++dir) {
  ------------------
  |  Branch (162:25): [True: 1.16M, False: 584k]
  ------------------
  163|  1.16M|        int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
  ------------------
  |  Branch (163:23): [True: 584k, False: 584k]
  ------------------
  164|  1.16M|        const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
  165|  1.16M|        if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
  ------------------
  |  Branch (165:13): [True: 60.4k, False: 1.10M]
  ------------------
  166|  60.4k|          const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
  167|  60.4k|          lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  60.4k|#define MAX_LOOP_FILTER 63
  ------------------
  168|  60.4k|        }
  169|       |
  170|  1.16M|        if (!lf->mode_ref_delta_enabled) {
  ------------------
  |  Branch (170:13): [True: 742k, False: 426k]
  ------------------
  171|       |          // we could get rid of this if we assume that deltas are set to
  172|       |          // zero when not in use; encoder always uses deltas
  173|   742k|          memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
  174|   742k|                 sizeof(lfi->lvl[plane][seg_id][dir]));
  175|   742k|        } else {
  176|   426k|          int ref, mode;
  177|   426k|          const int scale = 1 << (lvl_seg >> 5);
  178|   426k|          const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
  179|   426k|          lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] =
  180|   426k|              clamp(intra_lvl, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   426k|#define MAX_LOOP_FILTER 63
  ------------------
  181|       |
  182|  3.40M|          for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
  ------------------
  |  Branch (182:34): [True: 2.98M, False: 426k]
  ------------------
  183|  8.94M|            for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
  ------------------
  |  |   74|  8.94M|#define MAX_MODE_LF_DELTAS 2
  ------------------
  |  Branch (183:28): [True: 5.96M, False: 2.98M]
  ------------------
  184|  5.96M|              const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
  185|  5.96M|                                    lf->mode_deltas[mode] * scale;
  186|  5.96M|              lfi->lvl[plane][seg_id][dir][ref][mode] =
  187|  5.96M|                  clamp(inter_lvl, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  5.96M|#define MAX_LOOP_FILTER 63
  ------------------
  188|  5.96M|            }
  189|  2.98M|          }
  190|   426k|        }
  191|  1.16M|      }
  192|   584k|    }
  193|  73.0k|  }
  194|  30.0k|}
av1_filter_block_plane_vert:
 1308|   625k|                                 const uint32_t mi_row, const uint32_t mi_col) {
 1309|   625k|  const uint32_t scale_horz = plane_ptr->subsampling_x;
 1310|   625k|  const uint32_t scale_vert = plane_ptr->subsampling_y;
 1311|   625k|  uint8_t *const dst_ptr = plane_ptr->dst.buf;
 1312|   625k|  const int dst_stride = plane_ptr->dst.stride;
 1313|   625k|  const int plane_mi_rows =
 1314|   625k|      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
  ------------------
  |  |   41|   625k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1315|   625k|  const int plane_mi_cols =
 1316|   625k|      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
  ------------------
  |  |   41|   625k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1317|   625k|  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
  ------------------
  |  |   34|   625k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 139k, False: 485k]
  |  |  ------------------
  ------------------
 1318|   625k|                             (MAX_MIB_SIZE >> scale_vert));
 1319|   625k|  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
  ------------------
  |  |   34|   625k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 156k, False: 468k]
  |  |  ------------------
  ------------------
 1320|   625k|                             (MAX_MIB_SIZE >> scale_horz));
 1321|       |
 1322|  11.0M|  for (int y = 0; y < y_range; y++) {
  ------------------
  |  Branch (1322:19): [True: 10.4M, False: 625k]
  ------------------
 1323|  10.4M|    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
  ------------------
  |  |   40|  10.4M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  10.4M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1324|  63.3M|    for (int x = 0; x < x_range;) {
  ------------------
  |  Branch (1324:21): [True: 52.9M, False: 10.4M]
  ------------------
 1325|       |      // inner loop always filter vertical edges in a MI block. If MI size
 1326|       |      // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
 1327|       |      // If 4x4 transform is used, it will then filter the internal edge
 1328|       |      //  aligned with a 4x4 block
 1329|  52.9M|      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
  ------------------
  |  |   40|  52.9M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  52.9M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
  ------------------
  |  |   40|  52.9M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  52.9M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1330|  52.9M|      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
  ------------------
  |  |   40|  52.9M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  52.9M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
  ------------------
  |  |   40|  52.9M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  52.9M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1331|  52.9M|      uint32_t advance_units;
 1332|  52.9M|      TX_SIZE tx_size;
 1333|  52.9M|      AV1_DEBLOCKING_PARAMETERS params;
 1334|  52.9M|      memset(&params, 0, sizeof(params));
 1335|       |
 1336|  52.9M|      tx_size =
 1337|  52.9M|          set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
 1338|  52.9M|                             VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
 1339|  52.9M|      if (tx_size == TX_INVALID) {
  ------------------
  |  Branch (1339:11): [True: 0, False: 52.9M]
  ------------------
 1340|      0|        params.filter_length = 0;
 1341|      0|        tx_size = TX_4X4;
 1342|      0|      }
 1343|       |
 1344|  52.9M|      filter_vert(p, dst_stride, &params, cm->seq_params, USE_SINGLE);
 1345|       |
 1346|       |      // advance the destination pointer
 1347|  52.9M|      advance_units = tx_size_wide_unit[tx_size];
 1348|  52.9M|      x += advance_units;
 1349|  52.9M|      p += advance_units * MI_SIZE;
  ------------------
  |  |   40|  52.9M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  52.9M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1350|  52.9M|    }
 1351|  10.4M|  }
 1352|   625k|}
av1_filter_block_plane_horz:
 1910|   628k|                                 const uint32_t mi_row, const uint32_t mi_col) {
 1911|   628k|  const uint32_t scale_horz = plane_ptr->subsampling_x;
 1912|   628k|  const uint32_t scale_vert = plane_ptr->subsampling_y;
 1913|   628k|  uint8_t *const dst_ptr = plane_ptr->dst.buf;
 1914|   628k|  const int dst_stride = plane_ptr->dst.stride;
 1915|   628k|  const int plane_mi_rows =
 1916|   628k|      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
  ------------------
  |  |   41|   628k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1917|   628k|  const int plane_mi_cols =
 1918|   628k|      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
  ------------------
  |  |   41|   628k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1919|   628k|  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
  ------------------
  |  |   34|   628k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 140k, False: 488k]
  |  |  ------------------
  ------------------
 1920|   628k|                             (MAX_MIB_SIZE >> scale_vert));
 1921|   628k|  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
  ------------------
  |  |   34|   628k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 156k, False: 471k]
  |  |  ------------------
  ------------------
 1922|   628k|                             (MAX_MIB_SIZE >> scale_horz));
 1923|  13.2M|  for (int x = 0; x < x_range; x++) {
  ------------------
  |  Branch (1923:19): [True: 12.5M, False: 628k]
  ------------------
 1924|  12.5M|    uint8_t *p = dst_ptr + x * MI_SIZE;
  ------------------
  |  |   40|  12.5M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  12.5M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1925|  85.9M|    for (int y = 0; y < y_range;) {
  ------------------
  |  Branch (1925:21): [True: 73.3M, False: 12.5M]
  ------------------
 1926|       |      // inner loop always filter vertical edges in a MI block. If MI size
 1927|       |      // is 8x8, it will first filter the vertical edge aligned with a 8x8
 1928|       |      // block. If 4x4 transform is used, it will then filter the internal
 1929|       |      // edge aligned with a 4x4 block
 1930|  73.3M|      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
  ------------------
  |  |   40|  73.3M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  73.3M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
  ------------------
  |  |   40|  73.3M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  73.3M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1931|  73.3M|      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
  ------------------
  |  |   40|  73.3M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  73.3M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                    const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
  ------------------
  |  |   40|  73.3M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  73.3M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1932|  73.3M|      uint32_t advance_units;
 1933|  73.3M|      TX_SIZE tx_size;
 1934|  73.3M|      AV1_DEBLOCKING_PARAMETERS params;
 1935|  73.3M|      memset(&params, 0, sizeof(params));
 1936|       |
 1937|  73.3M|      tx_size = set_lpf_parameters(
 1938|  73.3M|          &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
 1939|  73.3M|          curr_x, curr_y, plane, plane_ptr);
 1940|  73.3M|      if (tx_size == TX_INVALID) {
  ------------------
  |  Branch (1940:11): [True: 0, False: 73.3M]
  ------------------
 1941|      0|        params.filter_length = 0;
 1942|      0|        tx_size = TX_4X4;
 1943|      0|      }
 1944|       |
 1945|  73.3M|      filter_horz(p, dst_stride, &params, cm->seq_params, USE_SINGLE);
 1946|       |
 1947|       |      // advance the destination pointer
 1948|  73.3M|      advance_units = tx_size_high_unit[tx_size];
 1949|  73.3M|      y += advance_units;
 1950|  73.3M|      p += advance_units * dst_stride * MI_SIZE;
  ------------------
  |  |   40|  73.3M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  73.3M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1951|  73.3M|    }
 1952|  12.5M|  }
 1953|   628k|}
av1_loopfilter.c:update_sharpness:
   47|  46.1k|static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
   48|  46.1k|  int lvl;
   49|       |
   50|       |  // For each possible value for the loop filter fill out limits
   51|  2.99M|  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
  ------------------
  |  |   27|  2.99M|#define MAX_LOOP_FILTER 63
  ------------------
  |  Branch (51:17): [True: 2.95M, False: 46.1k]
  ------------------
   52|       |    // Set loop filter parameters that control sharpness.
   53|  2.95M|    int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
   54|       |
   55|  2.95M|    if (sharpness_lvl > 0) {
  ------------------
  |  Branch (55:9): [True: 973k, False: 1.97M]
  ------------------
   56|   973k|      if (block_inside_limit > (9 - sharpness_lvl))
  ------------------
  |  Branch (56:11): [True: 763k, False: 209k]
  ------------------
   57|   763k|        block_inside_limit = (9 - sharpness_lvl);
   58|   973k|    }
   59|       |
   60|  2.95M|    if (block_inside_limit < 1) block_inside_limit = 1;
  ------------------
  |  Branch (60:9): [True: 77.1k, False: 2.87M]
  ------------------
   61|       |
   62|  2.95M|    memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
  ------------------
  |  |   30|  2.95M|#define SIMD_WIDTH 16
  ------------------
   63|  2.95M|    memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
   64|  2.95M|           SIMD_WIDTH);
  ------------------
  |  |   30|  2.95M|#define SIMD_WIDTH 16
  ------------------
   65|  2.95M|  }
   66|  46.1k|}
av1_loopfilter.c:set_lpf_parameters:
  228|   118M|    const int plane, const struct macroblockd_plane *const plane_ptr) {
  229|       |  // reset to initial values
  230|   118M|  params->filter_length = 0;
  231|       |
  232|       |  // no deblocking is required
  233|   118M|  const uint32_t width = plane_ptr->dst.width;
  234|   118M|  const uint32_t height = plane_ptr->dst.height;
  235|   123M|  if ((width <= x) || (height <= y)) {
  ------------------
  |  Branch (235:7): [True: 18.4E, False: 123M]
  |  Branch (235:23): [True: 1.05M, False: 122M]
  ------------------
  236|       |    // just return the smallest transform unit size
  237|  2.64M|    return TX_4X4;
  238|  2.64M|  }
  239|       |
  240|   116M|  const uint32_t scale_horz = plane_ptr->subsampling_x;
  241|   116M|  const uint32_t scale_vert = plane_ptr->subsampling_y;
  242|       |  // for sub8x8 block, chroma prediction mode is obtained from the bottom/right
  243|       |  // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row
  244|       |  // and mi_col should map to the bottom/right mi structure, i.e, both mi_row
  245|       |  // and mi_col should be odd number for chroma plane.
  246|   116M|  const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);
  ------------------
  |  |   39|   116M|#define MI_SIZE_LOG2 2
  ------------------
  247|   116M|  const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2);
  ------------------
  |  |   39|   116M|#define MI_SIZE_LOG2 2
  ------------------
  248|   116M|  MB_MODE_INFO **mi =
  249|   116M|      cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col;
  250|   116M|  const MB_MODE_INFO *mbmi = mi[0];
  251|       |  // If current mbmi is not correctly setup, return an invalid value to stop
  252|       |  // filtering. One example is that if this tile is not coded, then its mbmi
  253|       |  // it not set up.
  254|   116M|  if (mbmi == NULL) return TX_INVALID;
  ------------------
  |  Branch (254:7): [True: 0, False: 116M]
  ------------------
  255|       |
  256|   116M|  const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane,
  257|   116M|                                        scale_horz, scale_vert);
  258|       |
  259|   116M|  {
  260|   116M|    const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y);
  ------------------
  |  Branch (260:28): [True: 44.1M, False: 72.0M]
  ------------------
  261|   116M|    const uint32_t transform_masks =
  262|   116M|        edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
  ------------------
  |  Branch (262:9): [True: 40.7M, False: 75.4M]
  ------------------
  263|   116M|    const int32_t tu_edge = (coord & transform_masks) ? (0) : (1);
  ------------------
  |  Branch (263:29): [True: 0, False: 116M]
  ------------------
  264|       |
  265|   116M|    if (!tu_edge) return ts;
  ------------------
  |  Branch (265:9): [True: 0, False: 116M]
  ------------------
  266|       |
  267|       |    // prepare outer edge parameters. deblock the edge if it's an edge of a TU
  268|   116M|    {
  269|   116M|      const uint32_t curr_level =
  270|   116M|          get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
  271|   116M|      const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
  ------------------
  |  Branch (271:32): [True: 85.7M, False: 30.4M]
  |  Branch (271:51): [True: 7.67M, False: 78.0M]
  ------------------
  272|   116M|      uint32_t level = curr_level;
  273|   130M|      if (coord) {
  ------------------
  |  Branch (273:11): [True: 130M, False: 18.4E]
  ------------------
  274|   130M|        {
  275|   130M|          const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
  276|   130M|          if (mi_prev == NULL) return TX_INVALID;
  ------------------
  |  Branch (276:15): [True: 0, False: 130M]
  ------------------
  277|   130M|          const int pv_row =
  278|   130M|              (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert));
  ------------------
  |  Branch (278:15): [True: 43.1M, False: 87.1M]
  ------------------
  279|   130M|          const int pv_col =
  280|   130M|              (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col);
  ------------------
  |  Branch (280:15): [True: 40.5M, False: 89.8M]
  ------------------
  281|   130M|          const TX_SIZE pv_ts = get_transform_size(
  282|   130M|              xd, mi_prev, pv_row, pv_col, plane, scale_horz, scale_vert);
  283|       |
  284|   130M|          const uint32_t pv_lvl =
  285|   130M|              get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
  286|       |
  287|   130M|          const int pv_skip_txfm =
  288|   130M|              mi_prev->skip_txfm && is_inter_block(mi_prev);
  ------------------
  |  Branch (288:15): [True: 84.3M, False: 46.0M]
  |  Branch (288:37): [True: 7.03M, False: 77.2M]
  ------------------
  289|   130M|          const BLOCK_SIZE bsize = get_plane_block_size(
  290|   130M|              mbmi->bsize, plane_ptr->subsampling_x, plane_ptr->subsampling_y);
  291|   130M|          assert(bsize < BLOCK_SIZES_ALL);
  292|   139M|          const int prediction_masks = edge_dir == VERT_EDGE
  ------------------
  |  Branch (292:40): [True: 66.1M, False: 73.0M]
  ------------------
  293|   139M|                                           ? block_size_wide[bsize] - 1
  294|   139M|                                           : block_size_high[bsize] - 1;
  295|   139M|          const int32_t pu_edge = !(coord & prediction_masks);
  296|       |          // if the current and the previous blocks are skipped,
  297|       |          // deblock the edge if the edge belongs to a PU's edge only.
  298|   139M|          if ((curr_level || pv_lvl) &&
  ------------------
  |  Branch (298:16): [True: 117M, False: 21.5M]
  |  Branch (298:30): [True: 18.4E, False: 21.6M]
  ------------------
  299|   139M|              (!pv_skip_txfm || !curr_skipped || pu_edge)) {
  ------------------
  |  Branch (299:16): [True: 122M, False: 6.80M]
  |  Branch (299:33): [True: 405k, False: 6.39M]
  |  Branch (299:50): [True: 4.79M, False: 1.60M]
  ------------------
  300|   128M|            const int dim = (VERT_EDGE == edge_dir)
  ------------------
  |  Branch (300:29): [True: 59.9M, False: 68.1M]
  ------------------
  301|   128M|                                ? AOMMIN(tx_size_wide_unit_log2[ts],
  ------------------
  |  |   34|  59.9M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 4.17M, False: 55.8M]
  |  |  ------------------
  ------------------
  302|   128M|                                         tx_size_wide_unit_log2[pv_ts])
  303|   128M|                                : AOMMIN(tx_size_high_unit_log2[ts],
  ------------------
  |  |   34|  68.1M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.72M, False: 62.4M]
  |  |  ------------------
  ------------------
  304|   128M|                                         tx_size_high_unit_log2[pv_ts]);
  305|   128M|            if (plane) {
  ------------------
  |  Branch (305:17): [True: 66.8M, False: 61.3M]
  ------------------
  306|  66.8M|              params->filter_length = (dim == 0) ? 4 : 6;
  ------------------
  |  Branch (306:39): [True: 45.2M, False: 21.5M]
  ------------------
  307|  66.8M|            } else {
  308|  61.3M|              assert(dim < TX_SIZES);
  309|  70.8M|              assert(dim >= 0);
  310|  70.8M|              params->filter_length = tx_dim_to_filter_length[dim];
  311|  70.8M|            }
  312|       |
  313|       |            // update the level if the current block is skipped,
  314|       |            // but the previous one is not
  315|   137M|            level = (curr_level) ? (curr_level) : (pv_lvl);
  ------------------
  |  Branch (315:21): [True: 121M, False: 16.0M]
  ------------------
  316|   137M|          }
  317|   139M|        }
  318|   139M|      }
  319|       |      // prepare common parameters
  320|   134M|      if (params->filter_length) {
  ------------------
  |  Branch (320:11): [True: 124M, False: 10.4M]
  ------------------
  321|   124M|        const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
  322|   124M|        params->lfthr = limits;
  323|   124M|      }
  324|   134M|    }
  325|   134M|  }
  326|       |
  327|      0|  return ts;
  328|   116M|}
av1_loopfilter.c:get_transform_size:
  199|   160M|                   const int ss_x, const int ss_y) {
  200|   160M|  assert(mbmi != NULL);
  201|   185M|  if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4;
  ------------------
  |  Branch (201:7): [True: 185M, False: 18.4E]
  |  Branch (201:13): [True: 108M, False: 77.1M]
  ------------------
  202|       |
  203|  50.6M|  TX_SIZE tx_size = (plane == AOM_PLANE_Y)
  ------------------
  |  |  226|  50.6M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (203:21): [True: 57.5M, False: 18.4E]
  ------------------
  204|  57.5M|                        ? mbmi->tx_size
  205|  18.4E|                        : av1_get_max_uv_txsize(mbmi->bsize, ss_x, ss_y);
  206|  50.6M|  assert(tx_size < TX_SIZES_ALL);
  207|  89.4M|  if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip_txfm) {
  ------------------
  |  |  226|  89.4M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (207:7): [True: 58.1M, False: 31.2M]
  |  Branch (207:33): [True: 10.0M, False: 48.0M]
  |  Branch (207:57): [True: 2.64M, False: 7.46M]
  ------------------
  208|  2.64M|    const BLOCK_SIZE sb_type = mbmi->bsize;
  209|  2.64M|    const int blk_row = mi_row & (mi_size_high[sb_type] - 1);
  210|  2.64M|    const int blk_col = mi_col & (mi_size_wide[sb_type] - 1);
  211|  2.64M|    const TX_SIZE mb_tx_size =
  212|  2.64M|        mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)];
  213|  2.64M|    assert(mb_tx_size < TX_SIZES_ALL);
  214|  2.67M|    tx_size = mb_tx_size;
  215|  2.67M|  }
  216|       |
  217|  89.5M|  return tx_size;
  218|  89.4M|}
av1_loopfilter.c:get_filter_level:
   71|   211M|                                const MB_MODE_INFO *mbmi) {
   72|   211M|  const int segment_id = mbmi->segment_id;
   73|   211M|  if (cm->delta_q_info.delta_lf_present_flag) {
  ------------------
  |  Branch (73:7): [True: 116M, False: 95.6M]
  ------------------
   74|   116M|    int8_t delta_lf;
   75|   116M|    if (cm->delta_q_info.delta_lf_multi) {
  ------------------
  |  Branch (75:9): [True: 4.65M, False: 111M]
  ------------------
   76|  4.65M|      const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
   77|  4.65M|      delta_lf = mbmi->delta_lf[delta_lf_idx];
   78|   111M|    } else {
   79|   111M|      delta_lf = mbmi->delta_lf_from_base;
   80|   111M|    }
   81|   116M|    int base_level;
   82|   116M|    if (plane == 0)
  ------------------
  |  Branch (82:9): [True: 61.1M, False: 55.0M]
  ------------------
   83|  61.1M|      base_level = cm->lf.filter_level[dir_idx];
   84|  55.0M|    else if (plane == 1)
  ------------------
  |  Branch (84:14): [True: 59.1M, False: 18.4E]
  ------------------
   85|  59.1M|      base_level = cm->lf.filter_level_u;
   86|  18.4E|    else
   87|  18.4E|      base_level = cm->lf.filter_level_v;
   88|   116M|    int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   116M|#define MAX_LOOP_FILTER 63
  ------------------
   89|   116M|    assert(plane >= 0 && plane <= 2);
   90|   112M|    const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
   91|   112M|    if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) {
  ------------------
  |  Branch (91:9): [True: 96.7M, False: 15.8M]
  ------------------
   92|  96.7M|      const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id);
   93|  96.7M|      lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  96.7M|#define MAX_LOOP_FILTER 63
  ------------------
   94|  96.7M|    }
   95|       |
   96|   112M|    if (cm->lf.mode_ref_delta_enabled) {
  ------------------
  |  Branch (96:9): [True: 58.9M, False: 53.5M]
  ------------------
   97|  58.9M|      const int scale = 1 << (lvl_seg >> 5);
   98|  58.9M|      lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale;
   99|  58.9M|      if (mbmi->ref_frame[0] > INTRA_FRAME)
  ------------------
  |  Branch (99:11): [True: 4.35M, False: 54.6M]
  ------------------
  100|  4.35M|        lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
  101|  58.9M|      lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  58.9M|#define MAX_LOOP_FILTER 63
  ------------------
  102|  58.9M|    }
  103|   112M|    return lvl_seg;
  104|   116M|  } else {
  105|  95.6M|    return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]]
  106|  95.6M|                     [mode_lf_lut[mbmi->mode]];
  107|  95.6M|  }
  108|   211M|}
av1_loopfilter.c:filter_vert:
  909|  69.2M|                               USE_FILTER_TYPE use_filter_type) {
  910|  69.2M|  const loop_filter_thresh *limits = params->lfthr;
  911|  69.2M|#if CONFIG_AV1_HIGHBITDEPTH
  912|  69.2M|  const int use_highbitdepth = seq_params->use_highbitdepth;
  913|  69.2M|  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
  914|  69.2M|  if (use_highbitdepth) {
  ------------------
  |  Branch (914:7): [True: 57.9M, False: 11.3M]
  ------------------
  915|  57.9M|    uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst);
  ------------------
  |  |   75|  57.9M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  916|  57.9M|    if (use_filter_type == USE_QUAD) {
  ------------------
  |  Branch (916:9): [True: 0, False: 57.9M]
  ------------------
  917|      0|      switch (params->filter_length) {
  918|       |        // apply 4-tap filtering
  919|      0|        case 4:
  ------------------
  |  Branch (919:9): [True: 0, False: 0]
  ------------------
  920|      0|          aom_highbd_lpf_vertical_4_dual(
  921|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  922|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  923|      0|              bit_depth);
  924|      0|          aom_highbd_lpf_vertical_4_dual(
  925|      0|              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  926|      0|              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
  927|      0|              limits->lim, limits->hev_thr, bit_depth);
  928|      0|          break;
  929|      0|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (929:9): [True: 0, False: 0]
  ------------------
  930|      0|          aom_highbd_lpf_vertical_6_dual(
  ------------------
  |  |  895|      0|#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
  ------------------
  931|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  932|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  933|      0|              bit_depth);
  934|      0|          aom_highbd_lpf_vertical_6_dual(
  ------------------
  |  |  895|      0|#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
  ------------------
  935|      0|              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  936|      0|              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
  937|      0|              limits->lim, limits->hev_thr, bit_depth);
  938|      0|          break;
  939|       |        // apply 8-tap filtering
  940|      0|        case 8:
  ------------------
  |  Branch (940:9): [True: 0, False: 0]
  ------------------
  941|      0|          aom_highbd_lpf_vertical_8_dual(
  942|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  943|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  944|      0|              bit_depth);
  945|      0|          aom_highbd_lpf_vertical_8_dual(
  946|      0|              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  947|      0|              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
  948|      0|              limits->lim, limits->hev_thr, bit_depth);
  949|      0|          break;
  950|       |        // apply 14-tap filtering
  951|      0|        case 14:
  ------------------
  |  Branch (951:9): [True: 0, False: 0]
  ------------------
  952|      0|          aom_highbd_lpf_vertical_14_dual(
  953|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  954|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  955|      0|              bit_depth);
  956|      0|          aom_highbd_lpf_vertical_14_dual(
  957|      0|              dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  958|      0|              limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
  959|      0|              limits->lim, limits->hev_thr, bit_depth);
  960|      0|          break;
  961|       |        // no filtering
  962|      0|        default: break;
  ------------------
  |  Branch (962:9): [True: 0, False: 0]
  ------------------
  963|      0|      }
  964|  57.9M|    } else if (use_filter_type == USE_DUAL) {
  ------------------
  |  Branch (964:16): [True: 0, False: 57.9M]
  ------------------
  965|      0|      switch (params->filter_length) {
  966|       |        // apply 4-tap filtering
  967|      0|        case 4:
  ------------------
  |  Branch (967:9): [True: 0, False: 0]
  ------------------
  968|      0|          aom_highbd_lpf_vertical_4_dual(
  969|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  970|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  971|      0|              bit_depth);
  972|      0|          break;
  973|      0|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (973:9): [True: 0, False: 0]
  ------------------
  974|      0|          aom_highbd_lpf_vertical_6_dual(
  ------------------
  |  |  895|      0|#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
  ------------------
  975|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  976|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  977|      0|              bit_depth);
  978|      0|          break;
  979|       |        // apply 8-tap filtering
  980|      0|        case 8:
  ------------------
  |  Branch (980:9): [True: 0, False: 0]
  ------------------
  981|      0|          aom_highbd_lpf_vertical_8_dual(
  982|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  983|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  984|      0|              bit_depth);
  985|      0|          break;
  986|       |        // apply 14-tap filtering
  987|      0|        case 14:
  ------------------
  |  Branch (987:9): [True: 0, False: 0]
  ------------------
  988|      0|          aom_highbd_lpf_vertical_14_dual(
  989|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
  990|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
  991|      0|              bit_depth);
  992|      0|          break;
  993|       |        // no filtering
  994|      0|        default: break;
  ------------------
  |  Branch (994:9): [True: 0, False: 0]
  ------------------
  995|      0|      }
  996|  57.9M|    } else {
  997|  57.9M|      assert(use_filter_type == USE_SINGLE);
  998|  57.6M|      switch (params->filter_length) {
  999|       |        // apply 4-tap filtering
 1000|  36.8M|        case 4:
  ------------------
  |  Branch (1000:9): [True: 36.8M, False: 20.7M]
  ------------------
 1001|  36.8M|          aom_highbd_lpf_vertical_4(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  |  882|  36.8M|#define aom_highbd_lpf_vertical_4 aom_highbd_lpf_vertical_4_sse2
  ------------------
 1002|  36.8M|                                    limits->lim, limits->hev_thr, bit_depth);
 1003|  36.8M|          break;
 1004|  5.33M|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1004:9): [True: 5.33M, False: 52.2M]
  ------------------
 1005|  5.33M|          aom_highbd_lpf_vertical_6(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  |  891|  5.33M|#define aom_highbd_lpf_vertical_6 aom_highbd_lpf_vertical_6_sse2
  ------------------
 1006|  5.33M|                                    limits->lim, limits->hev_thr, bit_depth);
 1007|  5.33M|          break;
 1008|       |        // apply 8-tap filtering
 1009|   498k|        case 8:
  ------------------
  |  Branch (1009:9): [True: 498k, False: 57.1M]
  ------------------
 1010|   498k|          aom_highbd_lpf_vertical_8(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  |  899|   498k|#define aom_highbd_lpf_vertical_8 aom_highbd_lpf_vertical_8_sse2
  ------------------
 1011|   498k|                                    limits->lim, limits->hev_thr, bit_depth);
 1012|   498k|          break;
 1013|       |        // apply 14-tap filtering
 1014|  7.02M|        case 14:
  ------------------
  |  Branch (1014:9): [True: 7.02M, False: 50.6M]
  ------------------
 1015|  7.02M|          aom_highbd_lpf_vertical_14(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  |  873|  7.02M|#define aom_highbd_lpf_vertical_14 aom_highbd_lpf_vertical_14_sse2
  ------------------
 1016|  7.02M|                                     limits->lim, limits->hev_thr, bit_depth);
 1017|  7.02M|          break;
 1018|       |        // no filtering
 1019|  14.7M|        default: break;
  ------------------
  |  Branch (1019:9): [True: 14.7M, False: 42.8M]
  ------------------
 1020|  57.6M|      }
 1021|  57.6M|    }
 1022|  46.0M|    return;
 1023|  57.9M|  }
 1024|  11.3M|#endif  // CONFIG_AV1_HIGHBITDEPTH
 1025|  11.3M|  if (use_filter_type == USE_QUAD) {
  ------------------
  |  Branch (1025:7): [True: 0, False: 11.3M]
  ------------------
 1026|       |    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
 1027|       |    // passed as argument to quad loop filter because quad loop filter is
 1028|       |    // called for those cases where all the 4 set of loop filter parameters
 1029|       |    // are equal.
 1030|      0|    switch (params->filter_length) {
 1031|       |      // apply 4-tap filtering
 1032|      0|      case 4:
  ------------------
  |  Branch (1032:7): [True: 0, False: 0]
  ------------------
 1033|      0|        aom_lpf_vertical_4_quad(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1280|      0|#define aom_lpf_vertical_4_quad aom_lpf_vertical_4_quad_sse2
  ------------------
 1034|      0|                                limits->hev_thr);
 1035|      0|        break;
 1036|      0|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1036:7): [True: 0, False: 0]
  ------------------
 1037|      0|        aom_lpf_vertical_6_quad(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1292|      0|#define aom_lpf_vertical_6_quad aom_lpf_vertical_6_quad_sse2
  ------------------
 1038|      0|                                limits->hev_thr);
 1039|      0|        break;
 1040|       |      // apply 8-tap filtering
 1041|      0|      case 8:
  ------------------
  |  Branch (1041:7): [True: 0, False: 0]
  ------------------
 1042|      0|        aom_lpf_vertical_8_quad(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1304|      0|#define aom_lpf_vertical_8_quad aom_lpf_vertical_8_quad_sse2
  ------------------
 1043|      0|                                limits->hev_thr);
 1044|      0|        break;
 1045|       |      // apply 14-tap filtering
 1046|      0|      case 14:
  ------------------
  |  Branch (1046:7): [True: 0, False: 0]
  ------------------
 1047|      0|        aom_lpf_vertical_14_quad(dst, dst_stride, limits->mblim, limits->lim,
 1048|      0|                                 limits->hev_thr);
 1049|      0|        break;
 1050|       |      // no filtering
 1051|      0|      default: break;
  ------------------
  |  Branch (1051:7): [True: 0, False: 0]
  ------------------
 1052|      0|    }
 1053|  11.3M|  } else if (use_filter_type == USE_DUAL) {
  ------------------
  |  Branch (1053:14): [True: 0, False: 11.3M]
  ------------------
 1054|      0|    switch (params->filter_length) {
 1055|       |      // apply 4-tap filtering
 1056|      0|      case 4:
  ------------------
  |  Branch (1056:7): [True: 0, False: 0]
  ------------------
 1057|      0|        aom_lpf_vertical_4_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1276|      0|#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_sse2
  ------------------
 1058|      0|                                limits->hev_thr, limits->mblim, limits->lim,
 1059|      0|                                limits->hev_thr);
 1060|      0|        break;
 1061|      0|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1061:7): [True: 0, False: 0]
  ------------------
 1062|      0|        aom_lpf_vertical_6_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1288|      0|#define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_sse2
  ------------------
 1063|      0|                                limits->hev_thr, limits->mblim, limits->lim,
 1064|      0|                                limits->hev_thr);
 1065|      0|        break;
 1066|       |      // apply 8-tap filtering
 1067|      0|      case 8:
  ------------------
  |  Branch (1067:7): [True: 0, False: 0]
  ------------------
 1068|      0|        aom_lpf_vertical_8_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1300|      0|#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_sse2
  ------------------
 1069|      0|                                limits->hev_thr, limits->mblim, limits->lim,
 1070|      0|                                limits->hev_thr);
 1071|      0|        break;
 1072|       |      // apply 14-tap filtering
 1073|      0|      case 14:
  ------------------
  |  Branch (1073:7): [True: 0, False: 0]
  ------------------
 1074|      0|        aom_lpf_vertical_14_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1263|      0|#define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_sse2
  ------------------
 1075|      0|                                 limits->hev_thr, limits->mblim, limits->lim,
 1076|      0|                                 limits->hev_thr);
 1077|      0|        break;
 1078|       |      // no filtering
 1079|      0|      default: break;
  ------------------
  |  Branch (1079:7): [True: 0, False: 0]
  ------------------
 1080|      0|    }
 1081|  11.3M|  } else {
 1082|  11.3M|    assert(use_filter_type == USE_SINGLE);
 1083|  12.1M|    switch (params->filter_length) {
 1084|       |      // apply 4-tap filtering
 1085|   985k|      case 4:
  ------------------
  |  Branch (1085:7): [True: 985k, False: 11.1M]
  ------------------
 1086|   985k|        aom_lpf_vertical_4(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1272|   985k|#define aom_lpf_vertical_4 aom_lpf_vertical_4_sse2
  ------------------
 1087|   985k|                           limits->hev_thr);
 1088|   985k|        break;
 1089|  5.03M|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1089:7): [True: 5.03M, False: 7.10M]
  ------------------
 1090|  5.03M|        aom_lpf_vertical_6(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1284|  5.03M|#define aom_lpf_vertical_6 aom_lpf_vertical_6_sse2
  ------------------
 1091|  5.03M|                           limits->hev_thr);
 1092|  5.03M|        break;
 1093|       |      // apply 8-tap filtering
 1094|   541k|      case 8:
  ------------------
  |  Branch (1094:7): [True: 541k, False: 11.5M]
  ------------------
 1095|   541k|        aom_lpf_vertical_8(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1296|   541k|#define aom_lpf_vertical_8 aom_lpf_vertical_8_sse2
  ------------------
 1096|   541k|                           limits->hev_thr);
 1097|   541k|        break;
 1098|       |      // apply 14-tap filtering
 1099|  5.43M|      case 14:
  ------------------
  |  Branch (1099:7): [True: 5.43M, False: 6.70M]
  ------------------
 1100|  5.43M|        aom_lpf_vertical_14(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1259|  5.43M|#define aom_lpf_vertical_14 aom_lpf_vertical_14_sse2
  ------------------
 1101|  5.43M|                            limits->hev_thr);
 1102|  5.43M|        break;
 1103|       |      // no filtering
 1104|  1.67M|      default: break;
  ------------------
  |  Branch (1104:7): [True: 1.67M, False: 10.4M]
  ------------------
 1105|  12.1M|    }
 1106|  12.1M|  }
 1107|       |#if !CONFIG_AV1_HIGHBITDEPTH
 1108|       |  (void)seq_params;
 1109|       |#endif  // !CONFIG_AV1_HIGHBITDEPTH
 1110|  11.3M|}
av1_loopfilter.c:filter_horz:
 1511|  81.9M|                               USE_FILTER_TYPE use_filter_type) {
 1512|  81.9M|  const loop_filter_thresh *limits = params->lfthr;
 1513|  81.9M|#if CONFIG_AV1_HIGHBITDEPTH
 1514|  81.9M|  const int use_highbitdepth = seq_params->use_highbitdepth;
 1515|  81.9M|  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
 1516|  81.9M|  if (use_highbitdepth) {
  ------------------
  |  Branch (1516:7): [True: 65.4M, False: 16.4M]
  ------------------
 1517|  65.4M|    uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst);
  ------------------
  |  |   75|  65.4M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1518|  65.4M|    if (use_filter_type == USE_QUAD) {
  ------------------
  |  Branch (1518:9): [True: 0, False: 65.4M]
  ------------------
 1519|      0|      switch (params->filter_length) {
 1520|       |        // apply 4-tap filtering
 1521|      0|        case 4:
  ------------------
  |  Branch (1521:9): [True: 0, False: 0]
  ------------------
 1522|      0|          aom_highbd_lpf_horizontal_4_dual(
 1523|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1524|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1525|      0|              bit_depth);
 1526|      0|          aom_highbd_lpf_horizontal_4_dual(
 1527|      0|              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1528|      0|              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
 1529|      0|              limits->hev_thr, bit_depth);
 1530|      0|          break;
 1531|      0|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1531:9): [True: 0, False: 0]
  ------------------
 1532|      0|          aom_highbd_lpf_horizontal_6_dual(
  ------------------
  |  |  860|      0|#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
  ------------------
 1533|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1534|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1535|      0|              bit_depth);
 1536|      0|          aom_highbd_lpf_horizontal_6_dual(
  ------------------
  |  |  860|      0|#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
  ------------------
 1537|      0|              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1538|      0|              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
 1539|      0|              limits->hev_thr, bit_depth);
 1540|      0|          break;
 1541|       |        // apply 8-tap filtering
 1542|      0|        case 8:
  ------------------
  |  Branch (1542:9): [True: 0, False: 0]
  ------------------
 1543|      0|          aom_highbd_lpf_horizontal_8_dual(
 1544|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1545|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1546|      0|              bit_depth);
 1547|      0|          aom_highbd_lpf_horizontal_8_dual(
 1548|      0|              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1549|      0|              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
 1550|      0|              limits->hev_thr, bit_depth);
 1551|      0|          break;
 1552|       |        // apply 14-tap filtering
 1553|      0|        case 14:
  ------------------
  |  Branch (1553:9): [True: 0, False: 0]
  ------------------
 1554|      0|          aom_highbd_lpf_horizontal_14_dual(
 1555|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1556|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1557|      0|              bit_depth);
 1558|      0|          aom_highbd_lpf_horizontal_14_dual(
 1559|      0|              dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
  ------------------
  |  |   40|      0|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1560|      0|              limits->lim, limits->hev_thr, limits->mblim, limits->lim,
 1561|      0|              limits->hev_thr, bit_depth);
 1562|      0|          break;
 1563|       |        // no filtering
 1564|      0|        default: break;
  ------------------
  |  Branch (1564:9): [True: 0, False: 0]
  ------------------
 1565|      0|      }
 1566|  65.4M|    } else if (use_filter_type == USE_DUAL) {
  ------------------
  |  Branch (1566:16): [True: 0, False: 65.4M]
  ------------------
 1567|      0|      switch (params->filter_length) {
 1568|       |        // apply 4-tap filtering
 1569|      0|        case 4:
  ------------------
  |  Branch (1569:9): [True: 0, False: 0]
  ------------------
 1570|      0|          aom_highbd_lpf_horizontal_4_dual(
 1571|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1572|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1573|      0|              bit_depth);
 1574|      0|          break;
 1575|      0|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1575:9): [True: 0, False: 0]
  ------------------
 1576|      0|          aom_highbd_lpf_horizontal_6_dual(
  ------------------
  |  |  860|      0|#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
  ------------------
 1577|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1578|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1579|      0|              bit_depth);
 1580|      0|          break;
 1581|       |        // apply 8-tap filtering
 1582|      0|        case 8:
  ------------------
  |  Branch (1582:9): [True: 0, False: 0]
  ------------------
 1583|      0|          aom_highbd_lpf_horizontal_8_dual(
 1584|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1585|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1586|      0|              bit_depth);
 1587|      0|          break;
 1588|       |        // apply 14-tap filtering
 1589|      0|        case 14:
  ------------------
  |  Branch (1589:9): [True: 0, False: 0]
  ------------------
 1590|      0|          aom_highbd_lpf_horizontal_14_dual(
 1591|      0|              dst_shortptr, dst_stride, limits->mblim, limits->lim,
 1592|      0|              limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
 1593|      0|              bit_depth);
 1594|      0|          break;
 1595|       |        // no filtering
 1596|      0|        default: break;
  ------------------
  |  Branch (1596:9): [True: 0, False: 0]
  ------------------
 1597|      0|      }
 1598|  65.4M|    } else {
 1599|  65.4M|      assert(use_filter_type == USE_SINGLE);
 1600|  65.4M|      switch (params->filter_length) {
 1601|       |        // apply 4-tap filtering
 1602|  40.8M|        case 4:
  ------------------
  |  Branch (1602:9): [True: 40.8M, False: 24.6M]
  ------------------
 1603|  40.8M|          aom_highbd_lpf_horizontal_4(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  |  847|  40.8M|#define aom_highbd_lpf_horizontal_4 aom_highbd_lpf_horizontal_4_sse2
  ------------------
 1604|  40.8M|                                      limits->lim, limits->hev_thr, bit_depth);
 1605|  40.8M|          break;
 1606|  6.60M|        case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1606:9): [True: 6.60M, False: 58.8M]
  ------------------
 1607|  6.60M|          aom_highbd_lpf_horizontal_6(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  |  856|  6.60M|#define aom_highbd_lpf_horizontal_6 aom_highbd_lpf_horizontal_6_sse2
  ------------------
 1608|  6.60M|                                      limits->lim, limits->hev_thr, bit_depth);
 1609|  6.60M|          break;
 1610|       |        // apply 8-tap filtering
 1611|  3.08M|        case 8:
  ------------------
  |  Branch (1611:9): [True: 3.08M, False: 62.3M]
  ------------------
 1612|  3.08M|          aom_highbd_lpf_horizontal_8(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  |  864|  3.08M|#define aom_highbd_lpf_horizontal_8 aom_highbd_lpf_horizontal_8_sse2
  ------------------
 1613|  3.08M|                                      limits->lim, limits->hev_thr, bit_depth);
 1614|  3.08M|          break;
 1615|       |        // apply 14-tap filtering
 1616|  8.63M|        case 14:
  ------------------
  |  Branch (1616:9): [True: 8.63M, False: 56.8M]
  ------------------
 1617|  8.63M|          aom_highbd_lpf_horizontal_14(dst_shortptr, dst_stride, limits->mblim,
  ------------------
  |  |  838|  8.63M|#define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_sse2
  ------------------
 1618|  8.63M|                                       limits->lim, limits->hev_thr, bit_depth);
 1619|  8.63M|          break;
 1620|       |        // no filtering
 1621|  14.1M|        default: break;
  ------------------
  |  Branch (1621:9): [True: 14.1M, False: 51.2M]
  ------------------
 1622|  65.4M|      }
 1623|  65.4M|    }
 1624|  56.3M|    return;
 1625|  65.4M|  }
 1626|  16.4M|#endif  // CONFIG_AV1_HIGHBITDEPTH
 1627|  16.4M|  if (use_filter_type == USE_QUAD) {
  ------------------
  |  Branch (1627:7): [True: 0, False: 16.4M]
  ------------------
 1628|       |    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
 1629|       |    // passed as argument to quad loop filter because quad loop filter is
 1630|       |    // called for those cases where all the 4 set of loop filter parameters
 1631|       |    // are equal.
 1632|      0|    switch (params->filter_length) {
 1633|       |      // apply 4-tap filtering
 1634|      0|      case 4:
  ------------------
  |  Branch (1634:7): [True: 0, False: 0]
  ------------------
 1635|      0|        aom_lpf_horizontal_4_quad(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1229|      0|#define aom_lpf_horizontal_4_quad aom_lpf_horizontal_4_quad_sse2
  ------------------
 1636|      0|                                  limits->hev_thr);
 1637|      0|        break;
 1638|      0|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1638:7): [True: 0, False: 0]
  ------------------
 1639|      0|        aom_lpf_horizontal_6_quad(dst, dst_stride, limits->mblim, limits->lim,
 1640|      0|                                  limits->hev_thr);
 1641|      0|        break;
 1642|       |      // apply 8-tap filtering
 1643|      0|      case 8:
  ------------------
  |  Branch (1643:7): [True: 0, False: 0]
  ------------------
 1644|      0|        aom_lpf_horizontal_8_quad(dst, dst_stride, limits->mblim, limits->lim,
 1645|      0|                                  limits->hev_thr);
 1646|      0|        break;
 1647|       |      // apply 14-tap filtering
 1648|      0|      case 14:
  ------------------
  |  Branch (1648:7): [True: 0, False: 0]
  ------------------
 1649|      0|        aom_lpf_horizontal_14_quad(dst, dst_stride, limits->mblim, limits->lim,
 1650|      0|                                   limits->hev_thr);
 1651|      0|        break;
 1652|       |      // no filtering
 1653|      0|      default: break;
  ------------------
  |  Branch (1653:7): [True: 0, False: 0]
  ------------------
 1654|      0|    }
 1655|  16.4M|  } else if (use_filter_type == USE_DUAL) {
  ------------------
  |  Branch (1655:14): [True: 0, False: 16.4M]
  ------------------
 1656|      0|    switch (params->filter_length) {
 1657|       |      // apply 4-tap filtering
 1658|      0|      case 4:
  ------------------
  |  Branch (1658:7): [True: 0, False: 0]
  ------------------
 1659|      0|        aom_lpf_horizontal_4_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1225|      0|#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_sse2
  ------------------
 1660|      0|                                  limits->hev_thr, limits->mblim, limits->lim,
 1661|      0|                                  limits->hev_thr);
 1662|      0|        break;
 1663|      0|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1663:7): [True: 0, False: 0]
  ------------------
 1664|      0|        aom_lpf_horizontal_6_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1237|      0|#define aom_lpf_horizontal_6_dual aom_lpf_horizontal_6_dual_sse2
  ------------------
 1665|      0|                                  limits->hev_thr, limits->mblim, limits->lim,
 1666|      0|                                  limits->hev_thr);
 1667|      0|        break;
 1668|       |      // apply 8-tap filtering
 1669|      0|      case 8:
  ------------------
  |  Branch (1669:7): [True: 0, False: 0]
  ------------------
 1670|      0|        aom_lpf_horizontal_8_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1250|      0|#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_sse2
  ------------------
 1671|      0|                                  limits->hev_thr, limits->mblim, limits->lim,
 1672|      0|                                  limits->hev_thr);
 1673|      0|        break;
 1674|       |      // apply 14-tap filtering
 1675|      0|      case 14:
  ------------------
  |  Branch (1675:7): [True: 0, False: 0]
  ------------------
 1676|      0|        aom_lpf_horizontal_14_dual(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1212|      0|#define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_sse2
  ------------------
 1677|      0|                                   limits->hev_thr, limits->mblim, limits->lim,
 1678|      0|                                   limits->hev_thr);
 1679|      0|        break;
 1680|       |      // no filtering
 1681|      0|      default: break;
  ------------------
  |  Branch (1681:7): [True: 0, False: 0]
  ------------------
 1682|      0|    }
 1683|  16.4M|  } else {
 1684|  16.4M|    assert(use_filter_type == USE_SINGLE);
 1685|  18.1M|    switch (params->filter_length) {
 1686|       |      // apply 4-tap filtering
 1687|  3.18M|      case 4:
  ------------------
  |  Branch (1687:7): [True: 3.18M, False: 14.9M]
  ------------------
 1688|  3.18M|        aom_lpf_horizontal_4(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1221|  3.18M|#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_sse2
  ------------------
 1689|  3.18M|                             limits->hev_thr);
 1690|  3.18M|        break;
 1691|  6.38M|      case 6:  // apply 6-tap filter for chroma plane only
  ------------------
  |  Branch (1691:7): [True: 6.38M, False: 11.7M]
  ------------------
 1692|  6.38M|        aom_lpf_horizontal_6(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1233|  6.38M|#define aom_lpf_horizontal_6 aom_lpf_horizontal_6_sse2
  ------------------
 1693|  6.38M|                             limits->hev_thr);
 1694|  6.38M|        break;
 1695|       |      // apply 8-tap filtering
 1696|  2.79M|      case 8:
  ------------------
  |  Branch (1696:7): [True: 2.79M, False: 15.3M]
  ------------------
 1697|  2.79M|        aom_lpf_horizontal_8(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1246|  2.79M|#define aom_lpf_horizontal_8 aom_lpf_horizontal_8_sse2
  ------------------
 1698|  2.79M|                             limits->hev_thr);
 1699|  2.79M|        break;
 1700|       |      // apply 14-tap filtering
 1701|  6.72M|      case 14:
  ------------------
  |  Branch (1701:7): [True: 6.72M, False: 11.4M]
  ------------------
 1702|  6.72M|        aom_lpf_horizontal_14(dst, dst_stride, limits->mblim, limits->lim,
  ------------------
  |  | 1208|  6.72M|#define aom_lpf_horizontal_14 aom_lpf_horizontal_14_sse2
  ------------------
 1703|  6.72M|                              limits->hev_thr);
 1704|  6.72M|        break;
 1705|       |      // no filtering
 1706|  1.88M|      default: break;
  ------------------
  |  Branch (1706:7): [True: 1.88M, False: 16.2M]
  ------------------
 1707|  18.1M|    }
 1708|  18.1M|  }
 1709|       |#if !CONFIG_AV1_HIGHBITDEPTH
 1710|       |  (void)seq_params;
 1711|       |#endif  // !CONFIG_AV1_HIGHBITDEPTH
 1712|  16.4M|}

av1_rtcd:
   18|  16.1k|void av1_rtcd(void) { aom_once(setup_rtcd_internal); }

av1_inv_txfm2d.c:highbd_clip_pixel_add:
  127|  10.0M|                                             int bd) {
  128|  10.0M|  return clip_pixel_highbd(dest + (int)trans, bd);
  129|  10.0M|}
av1_inv_txfm_ssse3.c:cospi_arr:
   47|  8.72M|static inline const int32_t *cospi_arr(int n) {
   48|  8.72M|  return av1_cospi_arr_data[n - cos_bit_min];
   49|  8.72M|}
av1_inv_txfm_ssse3.c:get_txw_idx:
  242|  3.66M|static inline int get_txw_idx(TX_SIZE tx_size) {
  243|  3.66M|  return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
  244|  3.66M|}
av1_inv_txfm_ssse3.c:get_txh_idx:
  245|  3.66M|static inline int get_txh_idx(TX_SIZE tx_size) {
  246|  3.66M|  return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
  247|  3.66M|}
av1_inv_txfm_ssse3.c:get_rect_tx_log_ratio:
  215|  1.36M|static inline int get_rect_tx_log_ratio(int col, int row) {
  216|  1.36M|  if (col == row) return 0;
  ------------------
  |  Branch (216:7): [True: 177k, False: 1.18M]
  ------------------
  217|  1.18M|  if (col > row) {
  ------------------
  |  Branch (217:7): [True: 847k, False: 337k]
  ------------------
  218|   847k|    if (col == row * 2) return 1;
  ------------------
  |  Branch (218:9): [True: 444k, False: 403k]
  ------------------
  219|   403k|    if (col == row * 4) return 2;
  ------------------
  |  Branch (219:9): [True: 403k, False: 18.4E]
  ------------------
  220|  18.4E|    assert(0 && "Unsupported transform size");
  221|  18.4E|  } else {
  222|   337k|    if (row == col * 2) return -1;
  ------------------
  |  Branch (222:9): [True: 247k, False: 90.5k]
  ------------------
  223|  90.6k|    if (row == col * 4) return -2;
  ------------------
  |  Branch (223:9): [True: 90.6k, False: 18.4E]
  ------------------
  224|  18.4E|    assert(0 && "Unsupported transform size");
  225|  18.4E|  }
  226|      0|  return 0;  // Invalid
  227|  1.18M|}
av1_inv_txfm_ssse3.c:sinpi_arr:
   51|  1.07M|static inline const int32_t *sinpi_arr(int n) {
   52|  1.07M|  return av1_sinpi_arr_data[n - cos_bit_min];
   53|  1.07M|}
av1_inv_txfm_ssse3.c:get_flip_cfg:
  169|  3.50M|static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
  170|  3.50M|  switch (tx_type) {
  171|  1.54M|    case DCT_DCT:
  ------------------
  |  Branch (171:5): [True: 1.54M, False: 1.95M]
  ------------------
  172|  1.77M|    case ADST_DCT:
  ------------------
  |  Branch (172:5): [True: 231k, False: 3.26M]
  ------------------
  173|  2.16M|    case DCT_ADST:
  ------------------
  |  Branch (173:5): [True: 383k, False: 3.11M]
  ------------------
  174|  2.51M|    case ADST_ADST:
  ------------------
  |  Branch (174:5): [True: 356k, False: 3.14M]
  ------------------
  175|  2.51M|      *ud_flip = 0;
  176|  2.51M|      *lr_flip = 0;
  177|  2.51M|      break;
  178|   279k|    case IDTX:
  ------------------
  |  Branch (178:5): [True: 279k, False: 3.22M]
  ------------------
  179|   362k|    case V_DCT:
  ------------------
  |  Branch (179:5): [True: 83.0k, False: 3.41M]
  ------------------
  180|   585k|    case H_DCT:
  ------------------
  |  Branch (180:5): [True: 223k, False: 3.27M]
  ------------------
  181|   619k|    case V_ADST:
  ------------------
  |  Branch (181:5): [True: 33.2k, False: 3.46M]
  ------------------
  182|   681k|    case H_ADST:
  ------------------
  |  Branch (182:5): [True: 62.7k, False: 3.43M]
  ------------------
  183|   681k|      *ud_flip = 0;
  184|   681k|      *lr_flip = 0;
  185|   681k|      break;
  186|  31.1k|    case FLIPADST_DCT:
  ------------------
  |  Branch (186:5): [True: 31.1k, False: 3.46M]
  ------------------
  187|  91.2k|    case FLIPADST_ADST:
  ------------------
  |  Branch (187:5): [True: 60.1k, False: 3.44M]
  ------------------
  188|   117k|    case V_FLIPADST:
  ------------------
  |  Branch (188:5): [True: 26.4k, False: 3.47M]
  ------------------
  189|   117k|      *ud_flip = 1;
  190|   117k|      *lr_flip = 0;
  191|   117k|      break;
  192|  59.9k|    case DCT_FLIPADST:
  ------------------
  |  Branch (192:5): [True: 59.9k, False: 3.44M]
  ------------------
  193|   128k|    case ADST_FLIPADST:
  ------------------
  |  Branch (193:5): [True: 68.8k, False: 3.43M]
  ------------------
  194|   164k|    case H_FLIPADST:
  ------------------
  |  Branch (194:5): [True: 35.6k, False: 3.46M]
  ------------------
  195|   164k|      *ud_flip = 0;
  196|   164k|      *lr_flip = 1;
  197|   164k|      break;
  198|  17.7k|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (198:5): [True: 17.7k, False: 3.48M]
  ------------------
  199|  17.7k|      *ud_flip = 1;
  200|  17.7k|      *lr_flip = 1;
  201|  17.7k|      break;
  202|      0|    default:
  ------------------
  |  Branch (202:5): [True: 0, False: 3.50M]
  ------------------
  203|      0|      *ud_flip = 0;
  204|      0|      *lr_flip = 0;
  205|      0|      assert(0);
  206|  3.50M|  }
  207|  3.50M|}
highbd_inv_txfm_sse4.c:cospi_arr:
   47|  7.19M|static inline const int32_t *cospi_arr(int n) {
   48|  7.19M|  return av1_cospi_arr_data[n - cos_bit_min];
   49|  7.19M|}
highbd_inv_txfm_sse4.c:sinpi_arr:
   51|  3.23M|static inline const int32_t *sinpi_arr(int n) {
   52|  3.23M|  return av1_sinpi_arr_data[n - cos_bit_min];
   53|  3.23M|}
highbd_inv_txfm_sse4.c:get_txw_idx:
  242|  3.15M|static inline int get_txw_idx(TX_SIZE tx_size) {
  243|  3.15M|  return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
  244|  3.15M|}
highbd_inv_txfm_sse4.c:get_txh_idx:
  245|  3.15M|static inline int get_txh_idx(TX_SIZE tx_size) {
  246|  3.15M|  return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
  247|  3.15M|}
highbd_inv_txfm_sse4.c:get_rect_tx_log_ratio:
  215|   620k|static inline int get_rect_tx_log_ratio(int col, int row) {
  216|   620k|  if (col == row) return 0;
  ------------------
  |  Branch (216:7): [True: 422k, False: 197k]
  ------------------
  217|   197k|  if (col > row) {
  ------------------
  |  Branch (217:7): [True: 126k, False: 70.9k]
  ------------------
  218|   126k|    if (col == row * 2) return 1;
  ------------------
  |  Branch (218:9): [True: 117k, False: 9.85k]
  ------------------
  219|  9.85k|    if (col == row * 4) return 2;
  ------------------
  |  Branch (219:9): [True: 9.85k, False: 0]
  ------------------
  220|      0|    assert(0 && "Unsupported transform size");
  221|  70.9k|  } else {
  222|  70.9k|    if (row == col * 2) return -1;
  ------------------
  |  Branch (222:9): [True: 64.5k, False: 6.37k]
  ------------------
  223|  6.37k|    if (row == col * 4) return -2;
  ------------------
  |  Branch (223:9): [True: 6.37k, False: 0]
  ------------------
  224|      0|    assert(0 && "Unsupported transform size");
  225|      0|  }
  226|      0|  return 0;  // Invalid
  227|   197k|}
highbd_inv_txfm_sse4.c:get_flip_cfg:
  169|  2.77M|static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
  170|  2.77M|  switch (tx_type) {
  171|   744k|    case DCT_DCT:
  ------------------
  |  Branch (171:5): [True: 744k, False: 2.02M]
  ------------------
  172|   986k|    case ADST_DCT:
  ------------------
  |  Branch (172:5): [True: 241k, False: 2.53M]
  ------------------
  173|  1.28M|    case DCT_ADST:
  ------------------
  |  Branch (173:5): [True: 301k, False: 2.47M]
  ------------------
  174|  1.62M|    case ADST_ADST:
  ------------------
  |  Branch (174:5): [True: 337k, False: 2.43M]
  ------------------
  175|  1.62M|      *ud_flip = 0;
  176|  1.62M|      *lr_flip = 0;
  177|  1.62M|      break;
  178|   363k|    case IDTX:
  ------------------
  |  Branch (178:5): [True: 363k, False: 2.41M]
  ------------------
  179|   423k|    case V_DCT:
  ------------------
  |  Branch (179:5): [True: 60.2k, False: 2.71M]
  ------------------
  180|   664k|    case H_DCT:
  ------------------
  |  Branch (180:5): [True: 240k, False: 2.53M]
  ------------------
  181|   734k|    case V_ADST:
  ------------------
  |  Branch (181:5): [True: 69.8k, False: 2.70M]
  ------------------
  182|   773k|    case H_ADST:
  ------------------
  |  Branch (182:5): [True: 39.3k, False: 2.73M]
  ------------------
  183|   773k|      *ud_flip = 0;
  184|   773k|      *lr_flip = 0;
  185|   773k|      break;
  186|  14.2k|    case FLIPADST_DCT:
  ------------------
  |  Branch (186:5): [True: 14.2k, False: 2.75M]
  ------------------
  187|  82.1k|    case FLIPADST_ADST:
  ------------------
  |  Branch (187:5): [True: 67.8k, False: 2.70M]
  ------------------
  188|   115k|    case V_FLIPADST:
  ------------------
  |  Branch (188:5): [True: 33.3k, False: 2.74M]
  ------------------
  189|   115k|      *ud_flip = 1;
  190|   115k|      *lr_flip = 0;
  191|   115k|      break;
  192|  96.7k|    case DCT_FLIPADST:
  ------------------
  |  Branch (192:5): [True: 96.7k, False: 2.67M]
  ------------------
  193|   113k|    case ADST_FLIPADST:
  ------------------
  |  Branch (193:5): [True: 17.2k, False: 2.75M]
  ------------------
  194|   190k|    case H_FLIPADST:
  ------------------
  |  Branch (194:5): [True: 76.8k, False: 2.69M]
  ------------------
  195|   190k|      *ud_flip = 0;
  196|   190k|      *lr_flip = 1;
  197|   190k|      break;
  198|  68.4k|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (198:5): [True: 68.4k, False: 2.70M]
  ------------------
  199|  68.4k|      *ud_flip = 1;
  200|  68.4k|      *lr_flip = 1;
  201|  68.4k|      break;
  202|      0|    default:
  ------------------
  |  Branch (202:5): [True: 0, False: 2.77M]
  ------------------
  203|      0|      *ud_flip = 0;
  204|      0|      *lr_flip = 0;
  205|      0|      assert(0);
  206|  2.77M|  }
  207|  2.77M|}
av1_inv_txfm_avx2.c:get_flip_cfg:
  169|  2.32M|static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
  170|  2.32M|  switch (tx_type) {
  171|  1.47M|    case DCT_DCT:
  ------------------
  |  Branch (171:5): [True: 1.47M, False: 849k]
  ------------------
  172|  1.65M|    case ADST_DCT:
  ------------------
  |  Branch (172:5): [True: 177k, False: 2.15M]
  ------------------
  173|  1.96M|    case DCT_ADST:
  ------------------
  |  Branch (173:5): [True: 312k, False: 2.01M]
  ------------------
  174|  2.17M|    case ADST_ADST:
  ------------------
  |  Branch (174:5): [True: 209k, False: 2.11M]
  ------------------
  175|  2.17M|      *ud_flip = 0;
  176|  2.17M|      *lr_flip = 0;
  177|  2.17M|      break;
  178|      0|    case IDTX:
  ------------------
  |  Branch (178:5): [True: 0, False: 2.32M]
  ------------------
  179|  5.30k|    case V_DCT:
  ------------------
  |  Branch (179:5): [True: 5.30k, False: 2.32M]
  ------------------
  180|  23.6k|    case H_DCT:
  ------------------
  |  Branch (180:5): [True: 18.3k, False: 2.31M]
  ------------------
  181|  23.6k|    case V_ADST:
  ------------------
  |  Branch (181:5): [True: 0, False: 2.32M]
  ------------------
  182|  23.6k|    case H_ADST:
  ------------------
  |  Branch (182:5): [True: 0, False: 2.32M]
  ------------------
  183|  23.6k|      *ud_flip = 0;
  184|  23.6k|      *lr_flip = 0;
  185|  23.6k|      break;
  186|  28.6k|    case FLIPADST_DCT:
  ------------------
  |  Branch (186:5): [True: 28.6k, False: 2.30M]
  ------------------
  187|  59.6k|    case FLIPADST_ADST:
  ------------------
  |  Branch (187:5): [True: 30.9k, False: 2.29M]
  ------------------
  188|  59.6k|    case V_FLIPADST:
  ------------------
  |  Branch (188:5): [True: 0, False: 2.32M]
  ------------------
  189|  59.6k|      *ud_flip = 1;
  190|  59.6k|      *lr_flip = 0;
  191|  59.6k|      break;
  192|  23.3k|    case DCT_FLIPADST:
  ------------------
  |  Branch (192:5): [True: 23.3k, False: 2.30M]
  ------------------
  193|  52.1k|    case ADST_FLIPADST:
  ------------------
  |  Branch (193:5): [True: 28.8k, False: 2.29M]
  ------------------
  194|  52.1k|    case H_FLIPADST:
  ------------------
  |  Branch (194:5): [True: 0, False: 2.32M]
  ------------------
  195|  52.1k|      *ud_flip = 0;
  196|  52.1k|      *lr_flip = 1;
  197|  52.1k|      break;
  198|  15.2k|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (198:5): [True: 15.2k, False: 2.31M]
  ------------------
  199|  15.2k|      *ud_flip = 1;
  200|  15.2k|      *lr_flip = 1;
  201|  15.2k|      break;
  202|      0|    default:
  ------------------
  |  Branch (202:5): [True: 0, False: 2.32M]
  ------------------
  203|      0|      *ud_flip = 0;
  204|      0|      *lr_flip = 0;
  205|      0|      assert(0);
  206|  2.32M|  }
  207|  2.32M|}
av1_inv_txfm_avx2.c:get_txw_idx:
  242|  1.57M|static inline int get_txw_idx(TX_SIZE tx_size) {
  243|  1.57M|  return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
  244|  1.57M|}
av1_inv_txfm_avx2.c:get_txh_idx:
  245|  1.57M|static inline int get_txh_idx(TX_SIZE tx_size) {
  246|  1.57M|  return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
  247|  1.57M|}
av1_inv_txfm_avx2.c:get_rect_tx_log_ratio:
  215|  1.57M|static inline int get_rect_tx_log_ratio(int col, int row) {
  216|  1.57M|  if (col == row) return 0;
  ------------------
  |  Branch (216:7): [True: 1.15M, False: 418k]
  ------------------
  217|   418k|  if (col > row) {
  ------------------
  |  Branch (217:7): [True: 318k, False: 99.2k]
  ------------------
  218|   318k|    if (col == row * 2) return 1;
  ------------------
  |  Branch (218:9): [True: 178k, False: 140k]
  ------------------
  219|   140k|    if (col == row * 4) return 2;
  ------------------
  |  Branch (219:9): [True: 140k, False: 18.4E]
  ------------------
  220|  18.4E|    assert(0 && "Unsupported transform size");
  221|  18.4E|  } else {
  222|  99.2k|    if (row == col * 2) return -1;
  ------------------
  |  Branch (222:9): [True: 83.7k, False: 15.5k]
  ------------------
  223|  15.7k|    if (row == col * 4) return -2;
  ------------------
  |  Branch (223:9): [True: 15.7k, False: 18.4E]
  ------------------
  224|  18.4E|    assert(0 && "Unsupported transform size");
  225|  18.4E|  }
  226|      0|  return 0;  // Invalid
  227|   418k|}
av1_inv_txfm_avx2.c:cospi_arr:
   47|  4.34M|static inline const int32_t *cospi_arr(int n) {
   48|  4.34M|  return av1_cospi_arr_data[n - cos_bit_min];
   49|  4.34M|}
highbd_inv_txfm_avx2.c:get_txw_idx:
  242|  4.56M|static inline int get_txw_idx(TX_SIZE tx_size) {
  243|  4.56M|  return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
  244|  4.56M|}
highbd_inv_txfm_avx2.c:get_txh_idx:
  245|  4.56M|static inline int get_txh_idx(TX_SIZE tx_size) {
  246|  4.56M|  return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
  247|  4.56M|}
highbd_inv_txfm_avx2.c:get_rect_tx_log_ratio:
  215|  4.56M|static inline int get_rect_tx_log_ratio(int col, int row) {
  216|  4.56M|  if (col == row) return 0;
  ------------------
  |  Branch (216:7): [True: 2.69M, False: 1.87M]
  ------------------
  217|  1.87M|  if (col > row) {
  ------------------
  |  Branch (217:7): [True: 1.38M, False: 484k]
  ------------------
  218|  1.38M|    if (col == row * 2) return 1;
  ------------------
  |  Branch (218:9): [True: 687k, False: 699k]
  ------------------
  219|   699k|    if (col == row * 4) return 2;
  ------------------
  |  Branch (219:9): [True: 699k, False: 18.4E]
  ------------------
  220|  18.4E|    assert(0 && "Unsupported transform size");
  221|  18.4E|  } else {
  222|   484k|    if (row == col * 2) return -1;
  ------------------
  |  Branch (222:9): [True: 389k, False: 94.7k]
  ------------------
  223|  95.4k|    if (row == col * 4) return -2;
  ------------------
  |  Branch (223:9): [True: 95.4k, False: 18.4E]
  ------------------
  224|  18.4E|    assert(0 && "Unsupported transform size");
  225|  18.4E|  }
  226|      0|  return 0;  // Invalid
  227|  1.87M|}
highbd_inv_txfm_avx2.c:cospi_arr:
   47|  16.5M|static inline const int32_t *cospi_arr(int n) {
   48|  16.5M|  return av1_cospi_arr_data[n - cos_bit_min];
   49|  16.5M|}
highbd_inv_txfm_avx2.c:get_flip_cfg:
  169|  4.56M|static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
  170|  4.56M|  switch (tx_type) {
  171|  2.87M|    case DCT_DCT:
  ------------------
  |  Branch (171:5): [True: 2.87M, False: 1.69M]
  ------------------
  172|  3.26M|    case ADST_DCT:
  ------------------
  |  Branch (172:5): [True: 391k, False: 4.17M]
  ------------------
  173|  3.87M|    case DCT_ADST:
  ------------------
  |  Branch (173:5): [True: 609k, False: 3.95M]
  ------------------
  174|  4.31M|    case ADST_ADST:
  ------------------
  |  Branch (174:5): [True: 443k, False: 4.12M]
  ------------------
  175|  4.31M|      *ud_flip = 0;
  176|  4.31M|      *lr_flip = 0;
  177|  4.31M|      break;
  178|      0|    case IDTX:
  ------------------
  |  Branch (178:5): [True: 0, False: 4.56M]
  ------------------
  179|      0|    case V_DCT:
  ------------------
  |  Branch (179:5): [True: 0, False: 4.56M]
  ------------------
  180|      0|    case H_DCT:
  ------------------
  |  Branch (180:5): [True: 0, False: 4.56M]
  ------------------
  181|      0|    case V_ADST:
  ------------------
  |  Branch (181:5): [True: 0, False: 4.56M]
  ------------------
  182|      0|    case H_ADST:
  ------------------
  |  Branch (182:5): [True: 0, False: 4.56M]
  ------------------
  183|      0|      *ud_flip = 0;
  184|      0|      *lr_flip = 0;
  185|      0|      break;
  186|  35.7k|    case FLIPADST_DCT:
  ------------------
  |  Branch (186:5): [True: 35.7k, False: 4.53M]
  ------------------
  187|   102k|    case FLIPADST_ADST:
  ------------------
  |  Branch (187:5): [True: 66.6k, False: 4.50M]
  ------------------
  188|   102k|    case V_FLIPADST:
  ------------------
  |  Branch (188:5): [True: 0, False: 4.56M]
  ------------------
  189|   102k|      *ud_flip = 1;
  190|   102k|      *lr_flip = 0;
  191|   102k|      break;
  192|  87.8k|    case DCT_FLIPADST:
  ------------------
  |  Branch (192:5): [True: 87.8k, False: 4.47M]
  ------------------
  193|   117k|    case ADST_FLIPADST:
  ------------------
  |  Branch (193:5): [True: 29.6k, False: 4.53M]
  ------------------
  194|   117k|    case H_FLIPADST:
  ------------------
  |  Branch (194:5): [True: 0, False: 4.56M]
  ------------------
  195|   117k|      *ud_flip = 0;
  196|   117k|      *lr_flip = 1;
  197|   117k|      break;
  198|  30.9k|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (198:5): [True: 30.9k, False: 4.53M]
  ------------------
  199|  30.9k|      *ud_flip = 1;
  200|  30.9k|      *lr_flip = 1;
  201|  30.9k|      break;
  202|      0|    default:
  ------------------
  |  Branch (202:5): [True: 0, False: 4.56M]
  ------------------
  203|      0|      *ud_flip = 0;
  204|      0|      *lr_flip = 0;
  205|      0|      assert(0);
  206|  4.56M|  }
  207|  4.56M|}

av1_left_block_mode:
   17|  9.19M|PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
   18|  9.19M|  if (!left_mi) return DC_PRED;
  ------------------
  |  Branch (18:7): [True: 362k, False: 8.82M]
  ------------------
   19|  8.82M|  assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi));
   20|  8.82M|  return left_mi->mode;
   21|  8.82M|}
av1_above_block_mode:
   23|  9.19M|PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
   24|  9.19M|  if (!above_mi) return DC_PRED;
  ------------------
  |  Branch (24:7): [True: 408k, False: 8.78M]
  ------------------
   25|  8.78M|  assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi));
   26|  8.78M|  return above_mi->mode;
   27|  8.78M|}
av1_set_entropy_contexts:
   32|  35.9M|                              int has_eob, int aoff, int loff) {
   33|  35.9M|  ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff;
   34|  35.9M|  ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff;
   35|  35.9M|  const int txs_wide = tx_size_wide_unit[tx_size];
   36|  35.9M|  const int txs_high = tx_size_high_unit[tx_size];
   37|       |
   38|       |  // above
   39|  35.9M|  if (has_eob && xd->mb_to_right_edge < 0) {
  ------------------
  |  Branch (39:7): [True: 21.6M, False: 14.2M]
  |  Branch (39:18): [True: 1.28M, False: 20.4M]
  ------------------
   40|  1.28M|    const int blocks_wide = max_block_wide(xd, plane_bsize, plane);
   41|  1.28M|    const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff);
  ------------------
  |  |   34|  1.28M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.11M, False: 175k]
  |  |  ------------------
  ------------------
   42|  1.28M|    memset(a, has_eob, sizeof(*a) * above_contexts);
   43|  1.28M|    memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts));
   44|  34.6M|  } else {
   45|  34.6M|    memset(a, has_eob, sizeof(*a) * txs_wide);
   46|  34.6M|  }
   47|       |
   48|       |  // left
   49|  35.9M|  if (has_eob && xd->mb_to_bottom_edge < 0) {
  ------------------
  |  Branch (49:7): [True: 21.5M, False: 14.4M]
  |  Branch (49:18): [True: 787k, False: 20.7M]
  ------------------
   50|   787k|    const int blocks_high = max_block_high(xd, plane_bsize, plane);
   51|   787k|    const int left_contexts = AOMMIN(txs_high, blocks_high - loff);
  ------------------
  |  |   34|   787k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 633k, False: 153k]
  |  |  ------------------
  ------------------
   52|   787k|    memset(l, has_eob, sizeof(*l) * left_contexts);
   53|   787k|    memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts));
   54|  35.1M|  } else {
   55|  35.1M|    memset(l, has_eob, sizeof(*l) * txs_high);
   56|  35.1M|  }
   57|  35.9M|}
av1_reset_entropy_context:
   59|  6.08M|                               const int num_planes) {
   60|  6.08M|  assert(bsize < BLOCK_SIZES_ALL);
   61|  6.08M|  const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref;
   62|  22.8M|  for (int i = 0; i < nplanes; i++) {
  ------------------
  |  Branch (62:19): [True: 16.7M, False: 6.08M]
  ------------------
   63|  16.7M|    struct macroblockd_plane *const pd = &xd->plane[i];
   64|  16.7M|    const BLOCK_SIZE plane_bsize =
   65|  16.7M|        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   66|  16.7M|    const int txs_wide = mi_size_wide[plane_bsize];
   67|  16.7M|    const int txs_high = mi_size_high[plane_bsize];
   68|  16.7M|    memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
   69|  16.7M|    memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high);
   70|  16.7M|  }
   71|  6.08M|}
av1_reset_loop_filter_delta:
   73|   168k|void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) {
   74|   168k|  xd->delta_lf_from_base = 0;
   75|   168k|  const int frame_lf_count =
   76|   168k|      num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  ------------------
  |  |   72|   149k|#define FRAME_LF_COUNT 4
  ------------------
                    num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  ------------------
  |  |   72|  18.9k|#define FRAME_LF_COUNT 4
  ------------------
  |  Branch (76:7): [True: 149k, False: 18.9k]
  ------------------
   77|   806k|  for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0;
  ------------------
  |  Branch (77:23): [True: 637k, False: 168k]
  ------------------
   78|   168k|}
av1_reset_loop_restoration:
   80|   162k|void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) {
   81|   613k|  for (int p = 0; p < num_planes; ++p) {
  ------------------
  |  Branch (81:19): [True: 450k, False: 162k]
  ------------------
   82|   450k|    set_default_wiener(xd->wiener_info + p);
   83|   450k|    set_default_sgrproj(xd->sgrproj_info + p);
   84|   450k|  }
   85|   162k|}
av1_setup_block_planes:
   88|   175k|                            const int num_planes) {
   89|   175k|  int i;
   90|       |
   91|   637k|  for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (91:15): [True: 462k, False: 175k]
  ------------------
   92|   462k|    xd->plane[i].plane_type = get_plane_type(i);
   93|   462k|    xd->plane[i].subsampling_x = i ? ss_x : 0;
  ------------------
  |  Branch (93:34): [True: 286k, False: 175k]
  ------------------
   94|   462k|    xd->plane[i].subsampling_y = i ? ss_y : 0;
  ------------------
  |  Branch (94:34): [True: 286k, False: 175k]
  ------------------
   95|   462k|  }
   96|   240k|  for (i = num_planes; i < MAX_MB_PLANE; i++) {
  ------------------
  |  |   36|   240k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (96:24): [True: 64.9k, False: 175k]
  ------------------
   97|  64.9k|    xd->plane[i].subsampling_x = 1;
   98|  64.9k|    xd->plane[i].subsampling_y = 1;
   99|  64.9k|  }
  100|   175k|}

decodeframe.c:get_plane_type:
 1592|   121M|static inline PLANE_TYPE get_plane_type(int plane) {
 1593|   121M|  return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
  ------------------
  |  Branch (1593:10): [True: 42.9M, False: 78.7M]
  ------------------
 1594|   121M|}
decodeframe.c:av1_get_tx_type:
 1281|  19.3M|                                      int reduced_tx_set) {
 1282|  19.3M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
 1283|  19.3M|  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
  ------------------
  |  Branch (1283:7): [True: 1.11M, False: 18.2M]
  |  Branch (1283:41): [True: 657k, False: 17.5M]
  ------------------
 1284|  1.77M|    return DCT_DCT;
 1285|  1.77M|  }
 1286|       |
 1287|  17.5M|  TX_TYPE tx_type;
 1288|  17.5M|  if (plane_type == PLANE_TYPE_Y) {
  ------------------
  |  Branch (1288:7): [True: 8.57M, False: 8.98M]
  ------------------
 1289|  8.57M|    tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
 1290|  8.98M|  } else {
 1291|  8.98M|    if (is_inter_block(mbmi)) {
  ------------------
  |  Branch (1291:9): [True: 5.19M, False: 3.79M]
  ------------------
 1292|       |      // scale back to y plane's coordinate
 1293|  5.19M|      const struct macroblockd_plane *const pd = &xd->plane[plane_type];
 1294|  5.19M|      blk_row <<= pd->subsampling_y;
 1295|  5.19M|      blk_col <<= pd->subsampling_x;
 1296|  5.19M|      tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
 1297|  5.19M|    } else {
 1298|       |      // In intra mode, uv planes don't share the same prediction mode as y
 1299|       |      // plane, so the tx_type should not be shared
 1300|  3.79M|      tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
 1301|  3.79M|    }
 1302|  8.98M|    const TxSetType tx_set_type =
 1303|  8.98M|        av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
 1304|  8.98M|    if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT;
  ------------------
  |  Branch (1304:9): [True: 246k, False: 8.74M]
  ------------------
 1305|  8.98M|  }
 1306|  17.5M|  assert(tx_type < TX_TYPES);
 1307|  17.5M|  assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi),
 1308|  17.5M|                                                 reduced_tx_set)][tx_type]);
 1309|  17.5M|  return tx_type;
 1310|  17.5M|}
decodeframe.c:is_inter_block:
  372|   127M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|   127M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 429k, False: 127M]
  |  Branch (373:36): [True: 39.6M, False: 87.3M]
  ------------------
  374|   127M|}
decodeframe.c:is_intrabc_block:
  345|   168M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|   168M|  return mbmi->use_intrabc;
  347|   168M|}
decodeframe.c:intra_mode_to_tx_type:
 1003|  3.79M|                                     PLANE_TYPE plane_type) {
 1004|  3.79M|  static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
 1005|  3.79M|    DCT_DCT,    // DC_PRED
 1006|  3.79M|    ADST_DCT,   // V_PRED
 1007|  3.79M|    DCT_ADST,   // H_PRED
 1008|  3.79M|    DCT_DCT,    // D45_PRED
 1009|  3.79M|    ADST_ADST,  // D135_PRED
 1010|  3.79M|    ADST_DCT,   // D113_PRED
 1011|  3.79M|    DCT_ADST,   // D157_PRED
 1012|  3.79M|    DCT_ADST,   // D203_PRED
 1013|  3.79M|    ADST_DCT,   // D67_PRED
 1014|  3.79M|    ADST_ADST,  // SMOOTH_PRED
 1015|  3.79M|    ADST_DCT,   // SMOOTH_V_PRED
 1016|  3.79M|    DCT_ADST,   // SMOOTH_H_PRED
 1017|  3.79M|    ADST_ADST,  // PAETH_PRED
 1018|  3.79M|  };
 1019|  3.79M|  const PREDICTION_MODE mode =
 1020|  3.79M|      (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
  ------------------
  |  Branch (1020:7): [True: 0, False: 3.79M]
  ------------------
 1021|  3.79M|  assert(mode < INTRA_MODES);
 1022|  3.79M|  return _intra_mode_to_tx_type[mode];
 1023|  3.79M|}
decodeframe.c:get_uv_mode:
  349|  3.79M|static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
  350|  3.79M|  assert(mode < UV_INTRA_MODES);
  351|  3.79M|  static const PREDICTION_MODE uv2y[] = {
  352|  3.79M|    DC_PRED,        // UV_DC_PRED
  353|  3.79M|    V_PRED,         // UV_V_PRED
  354|  3.79M|    H_PRED,         // UV_H_PRED
  355|  3.79M|    D45_PRED,       // UV_D45_PRED
  356|  3.79M|    D135_PRED,      // UV_D135_PRED
  357|  3.79M|    D113_PRED,      // UV_D113_PRED
  358|  3.79M|    D157_PRED,      // UV_D157_PRED
  359|  3.79M|    D203_PRED,      // UV_D203_PRED
  360|  3.79M|    D67_PRED,       // UV_D67_PRED
  361|  3.79M|    SMOOTH_PRED,    // UV_SMOOTH_PRED
  362|  3.79M|    SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
  363|  3.79M|    SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
  364|  3.79M|    PAETH_PRED,     // UV_PAETH_PRED
  365|  3.79M|    DC_PRED,        // UV_CFL_PRED
  366|  3.79M|    INTRA_INVALID,  // UV_INTRA_MODES
  367|  3.79M|    INTRA_INVALID,  // UV_MODE_INVALID
  368|  3.79M|  };
  369|  3.79M|  return uv2y[mode];
  370|  3.79M|}
decodeframe.c:av1_get_ext_tx_set_type:
 1098|  26.5M|                                                int use_reduced_set) {
 1099|  26.5M|  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
 1100|  26.5M|  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1100:7): [True: 0, False: 26.5M]
  ------------------
 1101|  26.5M|  if (tx_size_sqr_up == TX_32X32)
  ------------------
  |  Branch (1101:7): [True: 3.61M, False: 22.9M]
  ------------------
 1102|  3.61M|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1102:12): [True: 1.14M, False: 2.47M]
  ------------------
 1103|  22.9M|  if (use_reduced_set)
  ------------------
  |  Branch (1103:7): [True: 6.32M, False: 16.6M]
  ------------------
 1104|  6.32M|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
  ------------------
  |  Branch (1104:12): [True: 3.87M, False: 2.44M]
  ------------------
 1105|  16.6M|  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
 1106|  16.6M|  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 1107|  22.9M|}
decodeframe.c:has_second_ref:
  376|  23.0M|static inline int has_second_ref(const MB_MODE_INFO *mbmi) {
  377|  23.0M|  return mbmi->ref_frame[1] > INTRA_FRAME;
  378|  23.0M|}
decodeframe.c:get_plane_block_size:
 1188|  91.9M|                                              int subsampling_y) {
 1189|  91.9M|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  91.9M|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|  91.9M|  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  91.9M|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  91.9M|}
decodeframe.c:is_cur_buf_hbd:
  932|  17.0M|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|  17.0M|#if CONFIG_AV1_HIGHBITDEPTH
  934|  17.0M|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|  17.0M|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 8.64M, False: 8.38M]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|  17.0M|}
decodeframe.c:is_global_mv_block:
  422|  14.6M|                                     TransformationType type) {
  423|  14.6M|  const PREDICTION_MODE mode = mbmi->mode;
  424|  14.6M|  const BLOCK_SIZE bsize = mbmi->bsize;
  425|  14.6M|  const int block_size_allowed =
  426|  14.6M|      AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|  14.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.34M, False: 11.2M]
  |  |  ------------------
  ------------------
  427|  14.6M|  return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
  ------------------
  |  Branch (427:11): [True: 2.28M, False: 12.3M]
  |  Branch (427:31): [True: 255k, False: 12.1M]
  |  Branch (427:59): [True: 491k, False: 2.05M]
  ------------------
  428|  14.6M|         block_size_allowed;
  ------------------
  |  Branch (428:10): [True: 424k, False: 66.8k]
  ------------------
  429|  14.6M|}
decodeframe.c:is_masked_compound_type:
  161|  14.6M|static inline int is_masked_compound_type(COMPOUND_TYPE type) {
  162|  14.6M|  return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
  ------------------
  |  Branch (162:11): [True: 329k, False: 14.3M]
  |  Branch (162:37): [True: 490k, False: 13.8M]
  ------------------
  163|  14.6M|}
decodeframe.c:is_interintra_pred:
 1442|  11.1M|static inline int is_interintra_pred(const MB_MODE_INFO *mbmi) {
 1443|  11.1M|  return mbmi->ref_frame[0] > INTRA_FRAME &&
  ------------------
  |  Branch (1443:10): [True: 11.0M, False: 147k]
  ------------------
 1444|  11.1M|         mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi);
  ------------------
  |  Branch (1444:10): [True: 958k, False: 10.0M]
  |  Branch (1444:47): [True: 958k, False: 0]
  ------------------
 1445|  11.1M|}
decodeframe.c:is_interintra_allowed:
 1425|   958k|static inline int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
 1426|   958k|  return is_interintra_allowed_bsize(mbmi->bsize) &&
  ------------------
  |  Branch (1426:10): [True: 958k, False: 1]
  ------------------
 1427|   958k|         is_interintra_allowed_mode(mbmi->mode) &&
  ------------------
  |  Branch (1427:10): [True: 958k, False: 1]
  ------------------
 1428|   958k|         is_interintra_allowed_ref(mbmi->ref_frame);
  ------------------
  |  Branch (1428:10): [True: 958k, False: 18.4E]
  ------------------
 1429|   958k|}
decodeframe.c:is_interintra_allowed_bsize:
 1413|   958k|static inline int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
 1414|   958k|  return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32);
  ------------------
  |  Branch (1414:10): [True: 958k, False: 0]
  |  Branch (1414:34): [True: 958k, False: 1]
  ------------------
 1415|   958k|}
decodeframe.c:is_interintra_allowed_mode:
 1417|   958k|static inline int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
 1418|   958k|  return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
  ------------------
  |  Branch (1418:10): [True: 958k, False: 0]
  |  Branch (1418:47): [True: 958k, False: 0]
  ------------------
 1419|   958k|}
decodeframe.c:is_interintra_allowed_ref:
 1421|   958k|static inline int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
 1422|   958k|  return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
  ------------------
  |  Branch (1422:10): [True: 958k, False: 0]
  |  Branch (1422:35): [True: 958k, False: 0]
  ------------------
 1423|   958k|}
decodeframe.c:is_neighbor_overlappable:
 1494|  1.38M|static inline int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
 1495|  1.38M|  return (is_inter_block(mbmi));
 1496|  1.38M|}
decodeframe.c:get_partition_subsize:
  991|  44.3M|                                               PARTITION_TYPE partition) {
  992|  44.3M|  if (partition == PARTITION_INVALID) {
  ------------------
  |  Branch (992:7): [True: 0, False: 44.3M]
  ------------------
  993|      0|    return BLOCK_INVALID;
  994|  44.3M|  } else {
  995|  44.3M|    const int sqr_bsize_idx = get_sqr_bsize_idx(bsize);
  996|  44.3M|    return sqr_bsize_idx >= SQR_BLOCK_SIZES
  ------------------
  |  |  129|  44.3M|#define SQR_BLOCK_SIZES 6
  ------------------
  |  Branch (996:12): [True: 0, False: 44.3M]
  ------------------
  997|  44.3M|               ? BLOCK_INVALID
  998|  44.3M|               : subsize_lookup[partition][sqr_bsize_idx];
  999|  44.3M|  }
 1000|  44.3M|}
decodeframe.c:get_sqr_bsize_idx:
  971|  44.3M|static inline int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
  972|  44.3M|  switch (bsize) {
  973|  2.02M|    case BLOCK_4X4: return 0;
  ------------------
  |  Branch (973:5): [True: 2.02M, False: 42.3M]
  ------------------
  974|  8.16M|    case BLOCK_8X8: return 1;
  ------------------
  |  Branch (974:5): [True: 8.16M, False: 36.2M]
  ------------------
  975|  14.2M|    case BLOCK_16X16: return 2;
  ------------------
  |  Branch (975:5): [True: 14.2M, False: 30.1M]
  ------------------
  976|  11.2M|    case BLOCK_32X32: return 3;
  ------------------
  |  Branch (976:5): [True: 11.2M, False: 33.1M]
  ------------------
  977|  5.96M|    case BLOCK_64X64: return 4;
  ------------------
  |  Branch (977:5): [True: 5.96M, False: 38.4M]
  ------------------
  978|  2.83M|    case BLOCK_128X128: return 5;
  ------------------
  |  Branch (978:5): [True: 2.83M, False: 41.5M]
  ------------------
  979|      0|    default: return SQR_BLOCK_SIZES;
  ------------------
  |  |  129|      0|#define SQR_BLOCK_SIZES 6
  ------------------
  |  Branch (979:5): [True: 0, False: 44.3M]
  ------------------
  980|  44.3M|  }
  981|  44.3M|}
decodeframe.c:block_signals_txsize:
 1027|  17.6M|static inline int block_signals_txsize(BLOCK_SIZE bsize) {
 1028|  17.6M|  return bsize > BLOCK_4X4;
 1029|  17.6M|}
decodeframe.c:bsize_to_tx_size_cat:
 1344|  1.63M|static inline int bsize_to_tx_size_cat(BLOCK_SIZE bsize) {
 1345|  1.63M|  assert(bsize < BLOCK_SIZES_ALL);
 1346|  1.63M|  static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = {
 1347|  1.63M|    0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4,
 1348|  1.63M|  };
 1349|  1.63M|  const int depth = bsize_to_tx_size_depth_table[bsize];
 1350|  1.63M|  assert(depth <= MAX_TX_CATS);
 1351|  1.63M|  return depth - 1;
 1352|  1.63M|}
decodeframe.c:bsize_to_max_depth:
 1325|  1.63M|static inline int bsize_to_max_depth(BLOCK_SIZE bsize) {
 1326|  1.63M|  static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = {
 1327|  1.63M|    0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 1328|  1.63M|  };
 1329|  1.63M|  return bsize_to_max_depth_table[bsize];
 1330|  1.63M|}
decodeframe.c:depth_to_tx_size:
 1354|  1.63M|static inline TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) {
 1355|  1.63M|  TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
 1356|  1.63M|  TX_SIZE tx_size = max_tx_size;
 1357|  2.52M|  for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size];
  ------------------
  |  Branch (1357:19): [True: 889k, False: 1.63M]
  ------------------
 1358|  1.63M|  return tx_size;
 1359|  1.63M|}
decodeframe.c:tx_size_from_tx_mode:
 1134|  12.1M|static inline TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) {
 1135|  12.1M|  const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
 1136|  12.1M|  const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize];
 1137|  12.1M|  if (bsize == BLOCK_4X4)
  ------------------
  |  Branch (1137:7): [True: 0, False: 12.1M]
  ------------------
 1138|      0|    return AOMMIN(max_txsize_lookup[bsize], largest_tx_size);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1139|  12.1M|  if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size)
  ------------------
  |  Branch (1139:7): [True: 12.1M, False: 18.4E]
  ------------------
 1140|  12.1M|    return max_rect_tx_size;
 1141|  18.4E|  else
 1142|  18.4E|    return largest_tx_size;
 1143|  12.1M|}
decodeframe.c:av1_get_tx_size:
 1381|  43.1M|static inline TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
 1382|  43.1M|  const MB_MODE_INFO *mbmi = xd->mi[0];
 1383|  43.1M|  if (xd->lossless[mbmi->segment_id]) return TX_4X4;
  ------------------
  |  Branch (1383:7): [True: 943k, False: 42.2M]
  ------------------
 1384|  42.2M|  if (plane == 0) return mbmi->tx_size;
  ------------------
  |  Branch (1384:7): [True: 15.2M, False: 26.9M]
  ------------------
 1385|  26.9M|  const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
 1386|  26.9M|  return av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
 1387|  26.9M|                               pd->subsampling_y);
 1388|  42.2M|}
decodeframe.c:av1_get_max_uv_txsize:
 1373|  37.7M|                                            int subsampling_y) {
 1374|  37.7M|  const BLOCK_SIZE plane_bsize =
 1375|  37.7M|      get_plane_block_size(bsize, subsampling_x, subsampling_y);
 1376|  37.7M|  assert(plane_bsize < BLOCK_SIZES_ALL);
 1377|  37.7M|  const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize];
 1378|  37.7M|  return av1_get_adjusted_tx_size(uv_tx);
 1379|  37.7M|}
decodeframe.c:av1_get_adjusted_tx_size:
 1361|  47.6M|static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
 1362|  47.6M|  switch (tx_size) {
 1363|  2.08M|    case TX_64X64:
  ------------------
  |  Branch (1363:5): [True: 2.08M, False: 45.6M]
  ------------------
 1364|  2.23M|    case TX_64X32:
  ------------------
  |  Branch (1364:5): [True: 147k, False: 47.5M]
  ------------------
 1365|  2.41M|    case TX_32X64: return TX_32X32;
  ------------------
  |  Branch (1365:5): [True: 174k, False: 47.5M]
  ------------------
 1366|   112k|    case TX_64X16: return TX_32X16;
  ------------------
  |  Branch (1366:5): [True: 112k, False: 47.5M]
  ------------------
 1367|  1.26M|    case TX_16X64: return TX_16X32;
  ------------------
  |  Branch (1367:5): [True: 1.26M, False: 46.4M]
  ------------------
 1368|  43.9M|    default: return tx_size;
  ------------------
  |  Branch (1368:5): [True: 43.9M, False: 3.77M]
  ------------------
 1369|  47.6M|  }
 1370|  47.6M|}
decodeframe.c:get_vartx_max_txsize:
 1448|  16.1M|                                       int plane) {
 1449|  16.1M|  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
  ------------------
  |  Branch (1449:7): [True: 15.8k, False: 16.1M]
  ------------------
 1450|  16.1M|  const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize];
 1451|  16.1M|  if (plane == 0) return max_txsize;            // luma
  ------------------
  |  Branch (1451:7): [True: 6.14M, False: 9.98M]
  ------------------
 1452|  9.98M|  return av1_get_adjusted_tx_size(max_txsize);  // chroma
 1453|  16.1M|}
decodeframe.c:av1_get_txb_size_index:
 1207|  8.31M|                                         int blk_col) {
 1208|  8.31M|  static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = {
 1209|  8.31M|    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3,
 1210|  8.31M|  };
 1211|  8.31M|  static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = {
 1212|  8.31M|    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2,
 1213|  8.31M|  };
 1214|  8.31M|  static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = {
 1215|  8.31M|    0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1,
 1216|  8.31M|  };
 1217|  8.31M|  const int index =
 1218|  8.31M|      ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) +
 1219|  8.31M|      (blk_col >> tw_w_log2_table[bsize]);
 1220|  8.31M|  assert(index < INTER_TX_SIZE_BUF_LEN);
 1221|  8.31M|  return index;
 1222|  8.31M|}
decodeframe.c:av1_get_block_dimensions:
 1516|   162k|                                            int *cols_within_bounds) {
 1517|   162k|  const int block_height = block_size_high[bsize];
 1518|   162k|  const int block_width = block_size_wide[bsize];
 1519|   162k|  const int block_rows = (xd->mb_to_bottom_edge >= 0)
  ------------------
  |  Branch (1519:26): [True: 162k, False: 716]
  ------------------
 1520|   162k|                             ? block_height
 1521|   162k|                             : (xd->mb_to_bottom_edge >> 3) + block_height;
 1522|   162k|  const int block_cols = (xd->mb_to_right_edge >= 0)
  ------------------
  |  Branch (1522:26): [True: 162k, False: 287]
  ------------------
 1523|   162k|                             ? block_width
 1524|   162k|                             : (xd->mb_to_right_edge >> 3) + block_width;
 1525|   162k|  const struct macroblockd_plane *const pd = &xd->plane[plane];
 1526|   162k|  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0));
 1527|   162k|  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0));
 1528|   162k|  assert(block_width >= block_cols);
 1529|   162k|  assert(block_height >= block_rows);
 1530|   162k|  const int plane_block_width = block_width >> pd->subsampling_x;
 1531|   162k|  const int plane_block_height = block_height >> pd->subsampling_y;
 1532|       |  // Special handling for chroma sub8x8.
 1533|   162k|  const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
  ------------------
  |  Branch (1533:32): [True: 73.3k, False: 89.4k]
  |  Branch (1533:45): [True: 57, False: 73.2k]
  ------------------
 1534|   162k|  const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
  ------------------
  |  Branch (1534:32): [True: 73.3k, False: 89.4k]
  |  Branch (1534:45): [True: 146, False: 73.1k]
  ------------------
 1535|   162k|  if (width) {
  ------------------
  |  Branch (1535:7): [True: 162k, False: 18.4E]
  ------------------
 1536|   162k|    *width = plane_block_width + 2 * is_chroma_sub8_x;
 1537|   162k|    assert(*width >= 0);
 1538|   162k|  }
 1539|   162k|  if (height) {
  ------------------
  |  Branch (1539:7): [True: 162k, False: 11]
  ------------------
 1540|   162k|    *height = plane_block_height + 2 * is_chroma_sub8_y;
 1541|   162k|    assert(*height >= 0);
 1542|   162k|  }
 1543|   162k|  if (rows_within_bounds) {
  ------------------
  |  Branch (1543:7): [True: 0, False: 162k]
  ------------------
 1544|      0|    *rows_within_bounds =
 1545|      0|        (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
 1546|      0|    assert(*rows_within_bounds >= 0);
 1547|      0|  }
 1548|   162k|  if (cols_within_bounds) {
  ------------------
  |  Branch (1548:7): [True: 0, False: 162k]
  ------------------
 1549|      0|    *cols_within_bounds =
 1550|      0|        (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
 1551|      0|    assert(*cols_within_bounds >= 0);
 1552|      0|  }
 1553|   162k|}
decodemv.c:is_inter_block:
  372|  31.8M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  31.8M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 41.1k, False: 31.7M]
  |  Branch (373:36): [True: 15.2M, False: 16.5M]
  ------------------
  374|  31.8M|}
decodemv.c:is_intrabc_block:
  345|  33.7M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  33.7M|  return mbmi->use_intrabc;
  347|  33.7M|}
decodemv.c:get_ext_tx_types:
 1125|  9.47M|                                   int use_reduced_set) {
 1126|  9.47M|  const int set_type =
 1127|  9.47M|      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
 1128|  9.47M|  return av1_num_ext_tx_set[set_type];
 1129|  9.47M|}
decodemv.c:av1_get_ext_tx_set_type:
 1098|  24.8M|                                                int use_reduced_set) {
 1099|  24.8M|  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
 1100|  24.8M|  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1100:7): [True: 647k, False: 24.2M]
  ------------------
 1101|  24.2M|  if (tx_size_sqr_up == TX_32X32)
  ------------------
  |  Branch (1101:7): [True: 2.42M, False: 21.8M]
  ------------------
 1102|  2.42M|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1102:12): [True: 1.29M, False: 1.12M]
  ------------------
 1103|  21.8M|  if (use_reduced_set)
  ------------------
  |  Branch (1103:7): [True: 8.18M, False: 13.6M]
  ------------------
 1104|  8.18M|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
  ------------------
  |  Branch (1104:12): [True: 3.56M, False: 4.62M]
  ------------------
 1105|  13.6M|  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
 1106|  13.6M|  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 1107|  21.8M|}
decodemv.c:get_ext_tx_set:
 1118|  7.71M|                                 int use_reduced_set) {
 1119|  7.71M|  const TxSetType set_type =
 1120|  7.71M|      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
 1121|  7.71M|  return ext_tx_set_index[is_inter][set_type];
 1122|  7.71M|}
decodemv.c:get_plane_block_size:
 1188|  88.7k|                                              int subsampling_y) {
 1189|  88.7k|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  88.7k|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|  88.7k|  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  88.7k|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  88.7k|}
decodemv.c:get_uv_mode:
  349|  9.87M|static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
  350|  9.87M|  assert(mode < UV_INTRA_MODES);
  351|  9.87M|  static const PREDICTION_MODE uv2y[] = {
  352|  9.87M|    DC_PRED,        // UV_DC_PRED
  353|  9.87M|    V_PRED,         // UV_V_PRED
  354|  9.87M|    H_PRED,         // UV_H_PRED
  355|  9.87M|    D45_PRED,       // UV_D45_PRED
  356|  9.87M|    D135_PRED,      // UV_D135_PRED
  357|  9.87M|    D113_PRED,      // UV_D113_PRED
  358|  9.87M|    D157_PRED,      // UV_D157_PRED
  359|  9.87M|    D203_PRED,      // UV_D203_PRED
  360|  9.87M|    D67_PRED,       // UV_D67_PRED
  361|  9.87M|    SMOOTH_PRED,    // UV_SMOOTH_PRED
  362|  9.87M|    SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
  363|  9.87M|    SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
  364|  9.87M|    PAETH_PRED,     // UV_PAETH_PRED
  365|  9.87M|    DC_PRED,        // UV_CFL_PRED
  366|  9.87M|    INTRA_INVALID,  // UV_INTRA_MODES
  367|  9.87M|    INTRA_INVALID,  // UV_MODE_INVALID
  368|  9.87M|  };
  369|  9.87M|  return uv2y[mode];
  370|  9.87M|}
decodemv.c:av1_allow_palette:
 1499|  13.3M|                                    BLOCK_SIZE sb_type) {
 1500|  13.3M|  assert(sb_type < BLOCK_SIZES_ALL);
 1501|  13.3M|  return allow_screen_content_tools &&
  ------------------
  |  Branch (1501:10): [True: 5.20M, False: 8.11M]
  ------------------
 1502|  13.3M|         block_size_wide[sb_type] <= MAX_PALETTE_BLOCK_WIDTH &&
  ------------------
  |  |   44|  18.5M|#define MAX_PALETTE_BLOCK_WIDTH 64
  ------------------
  |  Branch (1502:10): [True: 5.12M, False: 77.3k]
  ------------------
 1503|  13.3M|         block_size_high[sb_type] <= MAX_PALETTE_BLOCK_HEIGHT &&
  ------------------
  |  |   46|  18.4M|#define MAX_PALETTE_BLOCK_HEIGHT 64
  ------------------
  |  Branch (1503:10): [True: 5.11M, False: 17.6k]
  ------------------
 1504|  13.3M|         sb_type >= BLOCK_8X8;
  ------------------
  |  Branch (1504:10): [True: 4.65M, False: 460k]
  ------------------
 1505|  13.3M|}
decodemv.c:is_comp_ref_allowed:
   65|  5.45M|static inline int is_comp_ref_allowed(BLOCK_SIZE bsize) {
   66|  5.45M|  return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|  5.45M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.32M, False: 4.12M]
  |  |  ------------------
  ------------------
   67|  5.45M|}
decodemv.c:has_second_ref:
  376|  35.9M|static inline int has_second_ref(const MB_MODE_INFO *mbmi) {
  377|  35.9M|  return mbmi->ref_frame[1] > INTRA_FRAME;
  378|  35.9M|}
decodemv.c:comp_ref0:
  385|   772k|static inline MV_REFERENCE_FRAME comp_ref0(int ref_idx) {
  386|   772k|  static const MV_REFERENCE_FRAME lut[] = {
  387|   772k|    LAST_FRAME,     // LAST_LAST2_FRAMES,
  388|   772k|    LAST_FRAME,     // LAST_LAST3_FRAMES,
  389|   772k|    LAST_FRAME,     // LAST_GOLDEN_FRAMES,
  390|   772k|    BWDREF_FRAME,   // BWDREF_ALTREF_FRAMES,
  391|   772k|    LAST2_FRAME,    // LAST2_LAST3_FRAMES
  392|   772k|    LAST2_FRAME,    // LAST2_GOLDEN_FRAMES,
  393|   772k|    LAST3_FRAME,    // LAST3_GOLDEN_FRAMES,
  394|   772k|    BWDREF_FRAME,   // BWDREF_ALTREF2_FRAMES,
  395|   772k|    ALTREF2_FRAME,  // ALTREF2_ALTREF_FRAMES,
  396|   772k|  };
  397|   772k|  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
  398|   772k|  return lut[ref_idx];
  399|   772k|}
decodemv.c:comp_ref1:
  401|   479k|static inline MV_REFERENCE_FRAME comp_ref1(int ref_idx) {
  402|   479k|  static const MV_REFERENCE_FRAME lut[] = {
  403|   479k|    LAST2_FRAME,    // LAST_LAST2_FRAMES,
  404|   479k|    LAST3_FRAME,    // LAST_LAST3_FRAMES,
  405|   479k|    GOLDEN_FRAME,   // LAST_GOLDEN_FRAMES,
  406|   479k|    ALTREF_FRAME,   // BWDREF_ALTREF_FRAMES,
  407|   479k|    LAST3_FRAME,    // LAST2_LAST3_FRAMES
  408|   479k|    GOLDEN_FRAME,   // LAST2_GOLDEN_FRAMES,
  409|   479k|    GOLDEN_FRAME,   // LAST3_GOLDEN_FRAMES,
  410|   479k|    ALTREF2_FRAME,  // BWDREF_ALTREF2_FRAMES,
  411|   479k|    ALTREF_FRAME,   // ALTREF2_ALTREF_FRAMES,
  412|   479k|  };
  413|   479k|  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
  414|   479k|  return lut[ref_idx];
  415|   479k|}
decodemv.c:have_nearmv_in_inter_mode:
  151|  3.03M|static inline int have_nearmv_in_inter_mode(PREDICTION_MODE mode) {
  152|  3.03M|  return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV ||
  ------------------
  |  Branch (152:11): [True: 633k, False: 2.40M]
  |  Branch (152:29): [True: 197k, False: 2.20M]
  |  Branch (152:52): [True: 65.0k, False: 2.14M]
  ------------------
  153|  3.03M|          mode == NEW_NEARMV);
  ------------------
  |  Branch (153:11): [True: 25.4k, False: 2.11M]
  ------------------
  154|  3.03M|}
decodemv.c:is_inter_compound_mode:
   81|  6.48M|static inline int is_inter_compound_mode(PREDICTION_MODE mode) {
   82|  6.48M|  return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END;
  ------------------
  |  Branch (82:10): [True: 2.57M, False: 3.91M]
  |  Branch (82:43): [True: 2.57M, False: 18.4E]
  ------------------
   83|  6.48M|}
decodemv.c:compound_ref0_mode:
   85|   634k|static inline PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
   86|   634k|  static const PREDICTION_MODE lut[] = {
   87|   634k|    DC_PRED,        // DC_PRED
   88|   634k|    V_PRED,         // V_PRED
   89|   634k|    H_PRED,         // H_PRED
   90|   634k|    D45_PRED,       // D45_PRED
   91|   634k|    D135_PRED,      // D135_PRED
   92|   634k|    D113_PRED,      // D113_PRED
   93|   634k|    D157_PRED,      // D157_PRED
   94|   634k|    D203_PRED,      // D203_PRED
   95|   634k|    D67_PRED,       // D67_PRED
   96|   634k|    SMOOTH_PRED,    // SMOOTH_PRED
   97|   634k|    SMOOTH_V_PRED,  // SMOOTH_V_PRED
   98|   634k|    SMOOTH_H_PRED,  // SMOOTH_H_PRED
   99|   634k|    PAETH_PRED,     // PAETH_PRED
  100|   634k|    NEARESTMV,      // NEARESTMV
  101|   634k|    NEARMV,         // NEARMV
  102|   634k|    GLOBALMV,       // GLOBALMV
  103|   634k|    NEWMV,          // NEWMV
  104|   634k|    NEARESTMV,      // NEAREST_NEARESTMV
  105|   634k|    NEARMV,         // NEAR_NEARMV
  106|   634k|    NEARESTMV,      // NEAREST_NEWMV
  107|   634k|    NEWMV,          // NEW_NEARESTMV
  108|   634k|    NEARMV,         // NEAR_NEWMV
  109|   634k|    NEWMV,          // NEW_NEARMV
  110|   634k|    GLOBALMV,       // GLOBAL_GLOBALMV
  111|   634k|    NEWMV,          // NEW_NEWMV
  112|   634k|  };
  113|   634k|  assert(NELEMENTS(lut) == MB_MODE_COUNT);
  114|   633k|  assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode));
  115|   634k|  return lut[mode];
  116|   633k|}
decodemv.c:compound_ref1_mode:
  118|   634k|static inline PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
  119|   634k|  static const PREDICTION_MODE lut[] = {
  120|   634k|    MB_MODE_COUNT,  // DC_PRED
  121|   634k|    MB_MODE_COUNT,  // V_PRED
  122|   634k|    MB_MODE_COUNT,  // H_PRED
  123|   634k|    MB_MODE_COUNT,  // D45_PRED
  124|   634k|    MB_MODE_COUNT,  // D135_PRED
  125|   634k|    MB_MODE_COUNT,  // D113_PRED
  126|   634k|    MB_MODE_COUNT,  // D157_PRED
  127|   634k|    MB_MODE_COUNT,  // D203_PRED
  128|   634k|    MB_MODE_COUNT,  // D67_PRED
  129|   634k|    MB_MODE_COUNT,  // SMOOTH_PRED
  130|   634k|    MB_MODE_COUNT,  // SMOOTH_V_PRED
  131|   634k|    MB_MODE_COUNT,  // SMOOTH_H_PRED
  132|   634k|    MB_MODE_COUNT,  // PAETH_PRED
  133|   634k|    MB_MODE_COUNT,  // NEARESTMV
  134|   634k|    MB_MODE_COUNT,  // NEARMV
  135|   634k|    MB_MODE_COUNT,  // GLOBALMV
  136|   634k|    MB_MODE_COUNT,  // NEWMV
  137|   634k|    NEARESTMV,      // NEAREST_NEARESTMV
  138|   634k|    NEARMV,         // NEAR_NEARMV
  139|   634k|    NEWMV,          // NEAREST_NEWMV
  140|   634k|    NEARESTMV,      // NEW_NEARESTMV
  141|   634k|    NEWMV,          // NEAR_NEWMV
  142|   634k|    NEARMV,         // NEW_NEARMV
  143|   634k|    GLOBALMV,       // GLOBAL_GLOBALMV
  144|   634k|    NEWMV,          // NEW_NEWMV
  145|   634k|  };
  146|   634k|  assert(NELEMENTS(lut) == MB_MODE_COUNT);
  147|   634k|  assert(is_inter_compound_mode(mode));
  148|   634k|  return lut[mode];
  149|   634k|}
decodemv.c:is_interintra_allowed:
 1425|  4.20M|static inline int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
 1426|  4.20M|  return is_interintra_allowed_bsize(mbmi->bsize) &&
  ------------------
  |  Branch (1426:10): [True: 2.23M, False: 1.97M]
  ------------------
 1427|  4.20M|         is_interintra_allowed_mode(mbmi->mode) &&
  ------------------
  |  Branch (1427:10): [True: 1.85M, False: 378k]
  ------------------
 1428|  4.20M|         is_interintra_allowed_ref(mbmi->ref_frame);
  ------------------
  |  Branch (1428:10): [True: 1.85M, False: 18.4E]
  ------------------
 1429|  4.20M|}
decodemv.c:is_interintra_allowed_bsize:
 1413|  4.20M|static inline int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
 1414|  4.20M|  return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32);
  ------------------
  |  Branch (1414:10): [True: 3.41M, False: 792k]
  |  Branch (1414:34): [True: 2.23M, False: 1.18M]
  ------------------
 1415|  4.20M|}
decodemv.c:is_interintra_allowed_mode:
 1417|  2.23M|static inline int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
 1418|  2.23M|  return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
  ------------------
  |  Branch (1418:10): [True: 2.23M, False: 18.4E]
  |  Branch (1418:47): [True: 1.85M, False: 378k]
  ------------------
 1419|  2.23M|}
decodemv.c:is_interintra_allowed_ref:
 1421|  1.85M|static inline int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
 1422|  1.85M|  return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
  ------------------
  |  Branch (1422:10): [True: 1.85M, False: 18.4E]
  |  Branch (1422:35): [True: 1.85M, False: 6]
  ------------------
 1423|  1.85M|}
decodemv.c:is_motion_variation_allowed_bsize:
 1455|  6.68M|static inline int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
 1456|  6.68M|  assert(bsize < BLOCK_SIZES_ALL);
 1457|  6.68M|  return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|  6.68M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.58M, False: 5.09M]
  |  |  ------------------
  ------------------
 1458|  6.68M|}
decodemv.c:motion_mode_allowed:
 1474|  3.67M|    const MB_MODE_INFO *mbmi, int allow_warped_motion) {
 1475|  3.67M|  if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (1475:7): [True: 1.46M, False: 2.21M]
  ------------------
 1476|  2.21M|  if (xd->cur_frame_force_integer_mv == 0) {
  ------------------
  |  Branch (1476:7): [True: 2.11M, False: 95.3k]
  ------------------
 1477|  2.11M|    const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype;
 1478|  2.11M|    if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (1478:9): [True: 85.6k, False: 2.03M]
  ------------------
 1479|  2.11M|  }
 1480|  2.12M|  if (is_motion_variation_allowed_bsize(mbmi->bsize) &&
  ------------------
  |  Branch (1480:7): [True: 2.13M, False: 18.4E]
  ------------------
 1481|  2.13M|      is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
  ------------------
  |  Branch (1481:7): [True: 2.13M, False: 18.4E]
  |  Branch (1481:36): [True: 2.13M, False: 18.4E]
  ------------------
 1482|  2.13M|      is_motion_variation_allowed_compound(mbmi)) {
  ------------------
  |  Branch (1482:7): [True: 1.76M, False: 367k]
  ------------------
 1483|  1.76M|    assert(!has_second_ref(mbmi));
 1484|  1.76M|    if (mbmi->num_proj_ref >= 1 && allow_warped_motion &&
  ------------------
  |  Branch (1484:9): [True: 1.60M, False: 160k]
  |  Branch (1484:36): [True: 1.46M, False: 133k]
  ------------------
 1485|  1.76M|        !xd->cur_frame_force_integer_mv &&
  ------------------
  |  Branch (1485:9): [True: 1.46M, False: 2.02k]
  ------------------
 1486|  1.76M|        !av1_is_scaled(xd->block_ref_scale_factors[0])) {
  ------------------
  |  Branch (1486:9): [True: 1.36M, False: 104k]
  ------------------
 1487|  1.36M|      return WARPED_CAUSAL;
 1488|  1.36M|    }
 1489|   400k|    return OBMC_CAUSAL;
 1490|  1.76M|  }
 1491|   366k|  return SIMPLE_TRANSLATION;
 1492|  2.12M|}
decodemv.c:check_num_overlappable_neighbors:
 1468|  3.67M|static inline int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) {
 1469|  3.67M|  return mbmi->overlappable_neighbors != 0;
 1470|  3.67M|}
decodemv.c:is_global_mv_block:
  422|  2.11M|                                     TransformationType type) {
  423|  2.11M|  const PREDICTION_MODE mode = mbmi->mode;
  424|  2.11M|  const BLOCK_SIZE bsize = mbmi->bsize;
  425|  2.11M|  const int block_size_allowed =
  426|  2.11M|      AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|  2.11M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 423k, False: 1.69M]
  |  |  ------------------
  ------------------
  427|  2.11M|  return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
  ------------------
  |  Branch (427:11): [True: 343k, False: 1.77M]
  |  Branch (427:31): [True: 30.1k, False: 1.74M]
  |  Branch (427:59): [True: 85.6k, False: 288k]
  ------------------
  428|  2.11M|         block_size_allowed;
  ------------------
  |  Branch (428:10): [True: 85.6k, False: 0]
  ------------------
  429|  2.11M|}
decodemv.c:is_inter_mode:
   69|  2.13M|static inline int is_inter_mode(PREDICTION_MODE mode) {
   70|  2.13M|  return mode >= INTER_MODE_START && mode < INTER_MODE_END;
  ------------------
  |  Branch (70:10): [True: 2.13M, False: 18.4E]
  |  Branch (70:38): [True: 2.13M, False: 18.4E]
  ------------------
   71|  2.13M|}
decodemv.c:is_motion_variation_allowed_compound:
 1461|  2.13M|    const MB_MODE_INFO *mbmi) {
 1462|  2.13M|  return !has_second_ref(mbmi);
 1463|  2.13M|}
decodemv.c:is_masked_compound_type:
  161|  1.61M|static inline int is_masked_compound_type(COMPOUND_TYPE type) {
  162|  1.61M|  return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
  ------------------
  |  Branch (162:11): [True: 518k, False: 1.09M]
  |  Branch (162:37): [True: 50.8k, False: 1.04M]
  ------------------
  163|  1.61M|}
decodemv.c:is_nontrans_global_motion:
 1576|  4.13M|                                            const MB_MODE_INFO *mbmi) {
 1577|  4.13M|  int ref;
 1578|       |
 1579|       |  // First check if all modes are GLOBALMV
 1580|  4.13M|  if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0;
  ------------------
  |  Branch (1580:7): [True: 3.36M, False: 770k]
  |  Branch (1580:33): [True: 3.31M, False: 50.6k]
  ------------------
 1581|       |
 1582|   821k|  if (AOMMIN(mi_size_wide[mbmi->bsize], mi_size_high[mbmi->bsize]) < 2)
  ------------------
  |  |   34|   821k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 202k, False: 619k]
  |  |  ------------------
  ------------------
  |  Branch (1582:7): [True: 299k, False: 521k]
  ------------------
 1583|   299k|    return 0;
 1584|       |
 1585|       |  // Now check if all global motion is non translational
 1586|  1.05M|  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
  ------------------
  |  Branch (1586:17): [True: 570k, False: 482k]
  ------------------
 1587|   570k|    if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0;
  ------------------
  |  Branch (1587:9): [True: 39.1k, False: 530k]
  ------------------
 1588|   570k|  }
 1589|   482k|  return 1;
 1590|   521k|}
decoder.c:is_inter_block:
  372|  38.3M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  38.3M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 108k, False: 38.2M]
  |  Branch (373:36): [True: 12.3M, False: 25.8M]
  ------------------
  374|  38.3M|}
decoder.c:is_intrabc_block:
  345|  38.3M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  38.3M|  return mbmi->use_intrabc;
  347|  38.3M|}
decodetxb.c:get_plane_block_size:
 1188|  35.9M|                                              int subsampling_y) {
 1189|  35.9M|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  35.9M|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|  36.0M|  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  36.1M|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  36.0M|}
decodetxb.c:av1_get_adjusted_tx_size:
 1361|   103M|static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
 1362|   103M|  switch (tx_size) {
 1363|  1.84M|    case TX_64X64:
  ------------------
  |  Branch (1363:5): [True: 1.84M, False: 101M]
  ------------------
 1364|  2.14M|    case TX_64X32:
  ------------------
  |  Branch (1364:5): [True: 301k, False: 102M]
  ------------------
 1365|  2.24M|    case TX_32X64: return TX_32X32;
  ------------------
  |  Branch (1365:5): [True: 101k, False: 103M]
  ------------------
 1366|  1.26M|    case TX_64X16: return TX_32X16;
  ------------------
  |  Branch (1366:5): [True: 1.26M, False: 101M]
  ------------------
 1367|   116k|    case TX_16X64: return TX_16X32;
  ------------------
  |  Branch (1367:5): [True: 116k, False: 103M]
  ------------------
 1368|   100M|    default: return tx_size;
  ------------------
  |  Branch (1368:5): [True: 100M, False: 2.57M]
  ------------------
 1369|   103M|  }
 1370|   103M|}
decodetxb.c:is_inter_block:
  372|  85.3M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  85.3M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 488k, False: 84.8M]
  |  Branch (373:36): [True: 42.2M, False: 42.6M]
  ------------------
  374|  85.3M|}
decodetxb.c:is_intrabc_block:
  345|  85.4M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  85.4M|  return mbmi->use_intrabc;
  347|  85.4M|}
decodetxb.c:get_plane_type:
 1592|  46.4M|static inline PLANE_TYPE get_plane_type(int plane) {
 1593|  46.4M|  return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
  ------------------
  |  Branch (1593:10): [True: 19.9M, False: 26.5M]
  ------------------
 1594|  46.4M|}
decodetxb.c:av1_get_tx_type:
 1281|  31.3M|                                      int reduced_tx_set) {
 1282|  31.3M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
 1283|  31.3M|  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
  ------------------
  |  Branch (1283:7): [True: 5.45M, False: 25.9M]
  |  Branch (1283:41): [True: 825k, False: 25.1M]
  ------------------
 1284|  6.37M|    return DCT_DCT;
 1285|  6.37M|  }
 1286|       |
 1287|  25.0M|  TX_TYPE tx_type;
 1288|  25.0M|  if (plane_type == PLANE_TYPE_Y) {
  ------------------
  |  Branch (1288:7): [True: 12.8M, False: 12.1M]
  ------------------
 1289|  12.8M|    tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
 1290|  12.8M|  } else {
 1291|  12.1M|    if (is_inter_block(mbmi)) {
  ------------------
  |  Branch (1291:9): [True: 8.23M, False: 3.91M]
  ------------------
 1292|       |      // scale back to y plane's coordinate
 1293|  8.23M|      const struct macroblockd_plane *const pd = &xd->plane[plane_type];
 1294|  8.23M|      blk_row <<= pd->subsampling_y;
 1295|  8.23M|      blk_col <<= pd->subsampling_x;
 1296|  8.23M|      tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
 1297|  8.23M|    } else {
 1298|       |      // In intra mode, uv planes don't share the same prediction mode as y
 1299|       |      // plane, so the tx_type should not be shared
 1300|  3.91M|      tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
 1301|  3.91M|    }
 1302|  12.1M|    const TxSetType tx_set_type =
 1303|  12.1M|        av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
 1304|  12.1M|    if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT;
  ------------------
  |  Branch (1304:9): [True: 259k, False: 11.8M]
  ------------------
 1305|  12.1M|  }
 1306|  25.0M|  assert(tx_type < TX_TYPES);
 1307|  25.1M|  assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi),
 1308|  25.1M|                                                 reduced_tx_set)][tx_type]);
 1309|  25.1M|  return tx_type;
 1310|  25.1M|}
decodetxb.c:intra_mode_to_tx_type:
 1003|  4.05M|                                     PLANE_TYPE plane_type) {
 1004|  4.05M|  static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
 1005|  4.05M|    DCT_DCT,    // DC_PRED
 1006|  4.05M|    ADST_DCT,   // V_PRED
 1007|  4.05M|    DCT_ADST,   // H_PRED
 1008|  4.05M|    DCT_DCT,    // D45_PRED
 1009|  4.05M|    ADST_ADST,  // D135_PRED
 1010|  4.05M|    ADST_DCT,   // D113_PRED
 1011|  4.05M|    DCT_ADST,   // D157_PRED
 1012|  4.05M|    DCT_ADST,   // D203_PRED
 1013|  4.05M|    ADST_DCT,   // D67_PRED
 1014|  4.05M|    ADST_ADST,  // SMOOTH_PRED
 1015|  4.05M|    ADST_DCT,   // SMOOTH_V_PRED
 1016|  4.05M|    DCT_ADST,   // SMOOTH_H_PRED
 1017|  4.05M|    ADST_ADST,  // PAETH_PRED
 1018|  4.05M|  };
 1019|  4.05M|  const PREDICTION_MODE mode =
 1020|  4.05M|      (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
  ------------------
  |  Branch (1020:7): [True: 0, False: 4.05M]
  ------------------
 1021|  4.05M|  assert(mode < INTRA_MODES);
 1022|  4.05M|  return _intra_mode_to_tx_type[mode];
 1023|  4.05M|}
decodetxb.c:get_uv_mode:
  349|  4.05M|static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
  350|  4.05M|  assert(mode < UV_INTRA_MODES);
  351|  4.05M|  static const PREDICTION_MODE uv2y[] = {
  352|  4.05M|    DC_PRED,        // UV_DC_PRED
  353|  4.05M|    V_PRED,         // UV_V_PRED
  354|  4.05M|    H_PRED,         // UV_H_PRED
  355|  4.05M|    D45_PRED,       // UV_D45_PRED
  356|  4.05M|    D135_PRED,      // UV_D135_PRED
  357|  4.05M|    D113_PRED,      // UV_D113_PRED
  358|  4.05M|    D157_PRED,      // UV_D157_PRED
  359|  4.05M|    D203_PRED,      // UV_D203_PRED
  360|  4.05M|    D67_PRED,       // UV_D67_PRED
  361|  4.05M|    SMOOTH_PRED,    // UV_SMOOTH_PRED
  362|  4.05M|    SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
  363|  4.05M|    SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
  364|  4.05M|    PAETH_PRED,     // UV_PAETH_PRED
  365|  4.05M|    DC_PRED,        // UV_CFL_PRED
  366|  4.05M|    INTRA_INVALID,  // UV_INTRA_MODES
  367|  4.05M|    INTRA_INVALID,  // UV_MODE_INVALID
  368|  4.05M|  };
  369|  4.05M|  return uv2y[mode];
  370|  4.05M|}
decodetxb.c:av1_get_ext_tx_set_type:
 1098|  37.3M|                                                int use_reduced_set) {
 1099|  37.3M|  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
 1100|  37.3M|  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1100:7): [True: 0, False: 37.3M]
  ------------------
 1101|  37.3M|  if (tx_size_sqr_up == TX_32X32)
  ------------------
  |  Branch (1101:7): [True: 4.56M, False: 32.8M]
  ------------------
 1102|  4.56M|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1102:12): [True: 1.94M, False: 2.62M]
  ------------------
 1103|  32.8M|  if (use_reduced_set)
  ------------------
  |  Branch (1103:7): [True: 9.37M, False: 23.4M]
  ------------------
 1104|  9.37M|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
  ------------------
  |  Branch (1104:12): [True: 6.80M, False: 2.56M]
  ------------------
 1105|  23.4M|  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
 1106|  23.4M|  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 1107|  32.8M|}
detokenize.c:av1_get_block_dimensions:
 1516|   161k|                                            int *cols_within_bounds) {
 1517|   161k|  const int block_height = block_size_high[bsize];
 1518|   161k|  const int block_width = block_size_wide[bsize];
 1519|   161k|  const int block_rows = (xd->mb_to_bottom_edge >= 0)
  ------------------
  |  Branch (1519:26): [True: 161k, False: 713]
  ------------------
 1520|   161k|                             ? block_height
 1521|   161k|                             : (xd->mb_to_bottom_edge >> 3) + block_height;
 1522|   161k|  const int block_cols = (xd->mb_to_right_edge >= 0)
  ------------------
  |  Branch (1522:26): [True: 161k, False: 281]
  ------------------
 1523|   161k|                             ? block_width
 1524|   161k|                             : (xd->mb_to_right_edge >> 3) + block_width;
 1525|   161k|  const struct macroblockd_plane *const pd = &xd->plane[plane];
 1526|   161k|  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0));
 1527|   161k|  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0));
 1528|   161k|  assert(block_width >= block_cols);
 1529|   161k|  assert(block_height >= block_rows);
 1530|   161k|  const int plane_block_width = block_width >> pd->subsampling_x;
 1531|   161k|  const int plane_block_height = block_height >> pd->subsampling_y;
 1532|       |  // Special handling for chroma sub8x8.
 1533|   161k|  const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
  ------------------
  |  Branch (1533:32): [True: 73.1k, False: 88.6k]
  |  Branch (1533:45): [True: 50, False: 73.0k]
  ------------------
 1534|   161k|  const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
  ------------------
  |  Branch (1534:32): [True: 73.1k, False: 88.6k]
  |  Branch (1534:45): [True: 125, False: 73.0k]
  ------------------
 1535|   161k|  if (width) {
  ------------------
  |  Branch (1535:7): [True: 161k, False: 25]
  ------------------
 1536|   161k|    *width = plane_block_width + 2 * is_chroma_sub8_x;
 1537|   161k|    assert(*width >= 0);
 1538|   161k|  }
 1539|   161k|  if (height) {
  ------------------
  |  Branch (1539:7): [True: 161k, False: 19]
  ------------------
 1540|   161k|    *height = plane_block_height + 2 * is_chroma_sub8_y;
 1541|   161k|    assert(*height >= 0);
 1542|   161k|  }
 1543|   161k|  if (rows_within_bounds) {
  ------------------
  |  Branch (1543:7): [True: 161k, False: 14]
  ------------------
 1544|   161k|    *rows_within_bounds =
 1545|   161k|        (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
 1546|   161k|    assert(*rows_within_bounds >= 0);
 1547|   161k|  }
 1548|   161k|  if (cols_within_bounds) {
  ------------------
  |  Branch (1548:7): [True: 161k, False: 14]
  ------------------
 1549|   161k|    *cols_within_bounds =
 1550|   161k|        (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
 1551|   161k|    assert(*cols_within_bounds >= 0);
 1552|   161k|  }
 1553|   161k|}
av1_loopfilter.c:av1_get_max_uv_txsize:
 1373|  45.5M|                                            int subsampling_y) {
 1374|  45.5M|  const BLOCK_SIZE plane_bsize =
 1375|  45.5M|      get_plane_block_size(bsize, subsampling_x, subsampling_y);
 1376|  45.5M|  assert(plane_bsize < BLOCK_SIZES_ALL);
 1377|  50.9M|  const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize];
 1378|  50.9M|  return av1_get_adjusted_tx_size(uv_tx);
 1379|  45.5M|}
av1_loopfilter.c:av1_get_adjusted_tx_size:
 1361|  51.2M|static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
 1362|  51.2M|  switch (tx_size) {
 1363|  7.81M|    case TX_64X64:
  ------------------
  |  Branch (1363:5): [True: 7.81M, False: 43.4M]
  ------------------
 1364|  8.55M|    case TX_64X32:
  ------------------
  |  Branch (1364:5): [True: 743k, False: 50.5M]
  ------------------
 1365|  9.15M|    case TX_32X64: return TX_32X32;
  ------------------
  |  Branch (1365:5): [True: 595k, False: 50.6M]
  ------------------
 1366|  40.6k|    case TX_64X16: return TX_32X16;
  ------------------
  |  Branch (1366:5): [True: 40.6k, False: 51.2M]
  ------------------
 1367|  51.8k|    case TX_16X64: return TX_16X32;
  ------------------
  |  Branch (1367:5): [True: 51.8k, False: 51.2M]
  ------------------
 1368|  43.0M|    default: return tx_size;
  ------------------
  |  Branch (1368:5): [True: 43.0M, False: 8.24M]
  ------------------
 1369|  51.2M|  }
 1370|  51.2M|}
av1_loopfilter.c:av1_get_txb_size_index:
 1207|  2.64M|                                         int blk_col) {
 1208|  2.64M|  static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = {
 1209|  2.64M|    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3,
 1210|  2.64M|  };
 1211|  2.64M|  static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = {
 1212|  2.64M|    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2,
 1213|  2.64M|  };
 1214|  2.64M|  static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = {
 1215|  2.64M|    0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1,
 1216|  2.64M|  };
 1217|  2.64M|  const int index =
 1218|  2.64M|      ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) +
 1219|  2.64M|      (blk_col >> tw_w_log2_table[bsize]);
 1220|  2.64M|  assert(index < INTER_TX_SIZE_BUF_LEN);
 1221|  2.65M|  return index;
 1222|  2.64M|}
av1_loopfilter.c:is_inter_block:
  372|   189M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|   190M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 18.4E, False: 190M]
  |  Branch (373:36): [True: 14.6M, False: 176M]
  ------------------
  374|   189M|}
av1_loopfilter.c:is_intrabc_block:
  345|   192M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|   192M|  return mbmi->use_intrabc;
  347|   192M|}
av1_loopfilter.c:get_plane_block_size:
 1188|   167M|                                              int subsampling_y) {
 1189|   167M|  assert(bsize < BLOCK_SIZES_ALL);
 1190|   169M|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|   171M|  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|   174M|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|   171M|}
blockd.c:is_inter_block:
  372|  17.6M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  17.6M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 79.6k, False: 17.5M]
  |  Branch (373:36): [True: 18.4E, False: 17.5M]
  ------------------
  374|  17.6M|}
blockd.c:is_intrabc_block:
  345|  17.6M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  17.6M|  return mbmi->use_intrabc;
  347|  17.6M|}
blockd.c:get_plane_block_size:
 1188|  16.7M|                                              int subsampling_y) {
 1189|  16.7M|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  16.7M|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|  16.7M|  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  16.7M|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  16.7M|}
blockd.c:get_plane_type:
 1592|   462k|static inline PLANE_TYPE get_plane_type(int plane) {
 1593|   462k|  return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
  ------------------
  |  Branch (1593:10): [True: 175k, False: 286k]
  ------------------
 1594|   462k|}
cdef.c:get_plane_type:
 1592|  2.20M|static inline PLANE_TYPE get_plane_type(int plane) {
 1593|  2.20M|  return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
  ------------------
  |  Branch (1593:10): [True: 456k, False: 1.74M]
  ------------------
 1594|  2.20M|}
cfl.c:is_cur_buf_hbd:
  932|  6.01M|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|  6.01M|#if CONFIG_AV1_HIGHBITDEPTH
  934|  6.01M|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|  6.01M|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 3.21M, False: 2.79M]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|  6.01M|}
cfl.c:get_plane_block_size:
 1188|  1.88k|                                              int subsampling_y) {
 1189|  1.88k|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  1.88k|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|  1.88k|  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  1.88k|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  1.88k|}
idct.c:av1_get_max_eob:
 1596|  16.1M|static inline int av1_get_max_eob(TX_SIZE tx_size) {
 1597|  16.1M|  if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) {
  ------------------
  |  Branch (1597:7): [True: 172k, False: 15.9M]
  |  Branch (1597:30): [True: 74.0k, False: 15.8M]
  |  Branch (1597:53): [True: 27.2k, False: 15.8M]
  ------------------
 1598|   274k|    return 1024;
 1599|   274k|  }
 1600|  15.8M|  if (tx_size == TX_16X64 || tx_size == TX_64X16) {
  ------------------
  |  Branch (1600:7): [True: 32.3k, False: 15.8M]
  |  Branch (1600:30): [True: 312k, False: 15.5M]
  ------------------
 1601|   345k|    return 512;
 1602|   345k|  }
 1603|  15.5M|  return tx_size_2d[tx_size];
 1604|  15.8M|}
idct.c:is_cur_buf_hbd:
  932|  16.1M|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|  16.1M|#if CONFIG_AV1_HIGHBITDEPTH
  934|  16.1M|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|  16.1M|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 9.18M, False: 6.94M]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|  16.1M|}
idct.c:av1_get_ext_tx_set_type:
 1098|  16.1M|                                                int use_reduced_set) {
 1099|  16.1M|  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
 1100|  16.1M|  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1100:7): [True: 619k, False: 15.5M]
  ------------------
 1101|  15.5M|  if (tx_size_sqr_up == TX_32X32)
  ------------------
  |  Branch (1101:7): [True: 2.31M, False: 13.1M]
  ------------------
 1102|  2.31M|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
  ------------------
  |  Branch (1102:12): [True: 533k, False: 1.77M]
  ------------------
 1103|  13.1M|  if (use_reduced_set)
  ------------------
  |  Branch (1103:7): [True: 3.63M, False: 9.55M]
  ------------------
 1104|  3.63M|    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
  ------------------
  |  Branch (1104:12): [True: 1.61M, False: 2.02M]
  ------------------
 1105|  9.55M|  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
 1106|  9.55M|  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
 1107|  13.1M|}
idct.c:is_inter_block:
  372|  16.1M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  16.1M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 92.5k, False: 16.0M]
  |  Branch (373:36): [True: 5.88M, False: 10.1M]
  ------------------
  374|  16.1M|}
idct.c:is_intrabc_block:
  345|  16.1M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  16.1M|  return mbmi->use_intrabc;
  347|  16.1M|}
mvref_common.c:is_inter_block:
  372|  29.7M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  29.7M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 68.6k, False: 29.7M]
  |  Branch (373:36): [True: 25.8M, False: 3.83M]
  ------------------
  374|  29.7M|}
mvref_common.c:is_intrabc_block:
  345|  29.7M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  29.7M|  return mbmi->use_intrabc;
  347|  29.7M|}
mvref_common.c:is_global_mv_block:
  422|  22.1M|                                     TransformationType type) {
  423|  22.1M|  const PREDICTION_MODE mode = mbmi->mode;
  424|  22.1M|  const BLOCK_SIZE bsize = mbmi->bsize;
  425|  22.1M|  const int block_size_allowed =
  426|  22.1M|      AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|  22.1M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.40M, False: 16.7M]
  |  |  ------------------
  ------------------
  427|  22.1M|  return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
  ------------------
  |  Branch (427:11): [True: 4.50M, False: 17.6M]
  |  Branch (427:31): [True: 293k, False: 17.3M]
  |  Branch (427:59): [True: 1.05M, False: 3.74M]
  ------------------
  428|  22.1M|         block_size_allowed;
  ------------------
  |  Branch (428:10): [True: 813k, False: 242k]
  ------------------
  429|  22.1M|}
mvref_common.c:have_newmv_in_inter_mode:
  156|  20.8M|static inline int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
  157|  20.8M|  return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV ||
  ------------------
  |  Branch (157:11): [True: 9.27M, False: 11.5M]
  |  Branch (157:28): [True: 371k, False: 11.2M]
  |  Branch (157:49): [True: 102k, False: 11.1M]
  ------------------
  158|  20.8M|          mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
  ------------------
  |  Branch (158:11): [True: 94.5k, False: 11.0M]
  |  Branch (158:36): [True: 174k, False: 10.8M]
  |  Branch (158:58): [True: 58.3k, False: 10.7M]
  ------------------
  159|  20.8M|}
pred_common.c:is_inter_block:
  372|  12.3M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  12.3M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 18.4E, False: 12.3M]
  |  Branch (373:36): [True: 8.70M, False: 3.61M]
  ------------------
  374|  12.3M|}
pred_common.c:is_intrabc_block:
  345|  12.3M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  12.3M|  return mbmi->use_intrabc;
  347|  12.3M|}
pred_common.c:has_second_ref:
  376|  3.92M|static inline int has_second_ref(const MB_MODE_INFO *mbmi) {
  377|  3.92M|  return mbmi->ref_frame[1] > INTRA_FRAME;
  378|  3.92M|}
pred_common.c:has_uni_comp_refs:
  380|   645k|static inline int has_uni_comp_refs(const MB_MODE_INFO *mbmi) {
  381|   645k|  return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^
  ------------------
  |  Branch (381:10): [True: 645k, False: 229]
  |  Branch (381:34): [True: 93.4k, False: 552k]
  ------------------
  382|   645k|                                    (mbmi->ref_frame[1] >= BWDREF_FRAME)));
  383|   645k|}
quant_common.c:av1_get_adjusted_tx_size:
 1361|  35.7M|static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
 1362|  35.7M|  switch (tx_size) {
 1363|   959k|    case TX_64X64:
  ------------------
  |  Branch (1363:5): [True: 959k, False: 34.7M]
  ------------------
 1364|  1.81M|    case TX_64X32:
  ------------------
  |  Branch (1364:5): [True: 850k, False: 34.9M]
  ------------------
 1365|  2.61M|    case TX_32X64: return TX_32X32;
  ------------------
  |  Branch (1365:5): [True: 802k, False: 34.9M]
  ------------------
 1366|  1.09M|    case TX_64X16: return TX_32X16;
  ------------------
  |  Branch (1366:5): [True: 1.09M, False: 34.6M]
  ------------------
 1367|   808k|    case TX_16X64: return TX_16X32;
  ------------------
  |  Branch (1367:5): [True: 808k, False: 34.9M]
  ------------------
 1368|  31.2M|    default: return tx_size;
  ------------------
  |  Branch (1368:5): [True: 31.2M, False: 4.48M]
  ------------------
 1369|  35.7M|  }
 1370|  35.7M|}
reconinter.c:is_motion_variation_allowed_bsize:
 1455|  12.0M|static inline int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
 1456|  12.0M|  assert(bsize < BLOCK_SIZES_ALL);
 1457|  12.0M|  return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
  ------------------
  |  |   34|  12.0M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.85M, False: 9.17M]
  |  |  ------------------
  ------------------
 1458|  12.0M|}
reconinter.c:is_neighbor_overlappable:
 1494|  5.24M|static inline int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
 1495|  5.24M|  return (is_inter_block(mbmi));
 1496|  5.24M|}
reconinter.c:is_inter_block:
  372|  5.24M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  5.24M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 1.59k, False: 5.23M]
  |  Branch (373:36): [True: 4.62M, False: 616k]
  ------------------
  374|  5.24M|}
reconinter.c:is_intrabc_block:
  345|  5.23M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  5.23M|  return mbmi->use_intrabc;
  347|  5.23M|}
reconinter.c:get_plane_block_size:
 1188|  9.39M|                                              int subsampling_y) {
 1189|  9.39M|  assert(bsize < BLOCK_SIZES_ALL);
 1190|  9.39M|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1191|  9.39M|  assert(subsampling_y >= 0 && subsampling_y < 2);
 1192|  9.39M|  return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
 1193|  9.39M|}
reconinter.c:is_cur_buf_hbd:
  932|  5.43M|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|  5.43M|#if CONFIG_AV1_HIGHBITDEPTH
  934|  5.43M|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|  5.43M|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 2.68M, False: 2.75M]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|  5.43M|}
reconinter.c:has_second_ref:
  376|  1.24M|static inline int has_second_ref(const MB_MODE_INFO *mbmi) {
  377|  1.24M|  return mbmi->ref_frame[1] > INTRA_FRAME;
  378|  1.24M|}
reconintra.c:is_cur_buf_hbd:
  932|   113M|static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  933|   113M|#if CONFIG_AV1_HIGHBITDEPTH
  934|   113M|  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
  ------------------
  |  |  142|   113M|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (934:10): [True: 95.6M, False: 17.8M]
  ------------------
  935|       |#else
  936|       |  (void)xd;
  937|       |  return 0;
  938|       |#endif
  939|   113M|}
reconintra.c:is_inter_block:
  372|  10.6M|static inline int is_inter_block(const MB_MODE_INFO *mbmi) {
  373|  10.6M|  return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
  ------------------
  |  Branch (373:10): [True: 39.2k, False: 10.5M]
  |  Branch (373:36): [True: 1.08M, False: 9.50M]
  ------------------
  374|  10.6M|}
reconintra.c:is_intrabc_block:
  345|  10.6M|static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) {
  346|  10.6M|  return mbmi->use_intrabc;
  347|  10.6M|}
reconintra.c:get_uv_mode:
  349|  73.4M|static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
  350|  73.4M|  assert(mode < UV_INTRA_MODES);
  351|  73.4M|  static const PREDICTION_MODE uv2y[] = {
  352|  73.4M|    DC_PRED,        // UV_DC_PRED
  353|  73.4M|    V_PRED,         // UV_V_PRED
  354|  73.4M|    H_PRED,         // UV_H_PRED
  355|  73.4M|    D45_PRED,       // UV_D45_PRED
  356|  73.4M|    D135_PRED,      // UV_D135_PRED
  357|  73.4M|    D113_PRED,      // UV_D113_PRED
  358|  73.4M|    D157_PRED,      // UV_D157_PRED
  359|  73.4M|    D203_PRED,      // UV_D203_PRED
  360|  73.4M|    D67_PRED,       // UV_D67_PRED
  361|  73.4M|    SMOOTH_PRED,    // UV_SMOOTH_PRED
  362|  73.4M|    SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
  363|  73.4M|    SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
  364|  73.4M|    PAETH_PRED,     // UV_PAETH_PRED
  365|  73.4M|    DC_PRED,        // UV_CFL_PRED
  366|  73.4M|    INTRA_INVALID,  // UV_INTRA_MODES
  367|  73.4M|    INTRA_INVALID,  // UV_MODE_INVALID
  368|  73.4M|  };
  369|  73.4M|  return uv2y[mode];
  370|  73.4M|}

av1_cdef_compute_sb_list:
   43|   639k|                             BLOCK_SIZE bs) {
   44|   639k|  MB_MODE_INFO **grid = mi_params->mi_grid_base;
   45|   639k|  int maxc = mi_params->mi_cols - mi_col;
   46|   639k|  int maxr = mi_params->mi_rows - mi_row;
   47|       |
   48|   639k|  if (bs == BLOCK_128X128 || bs == BLOCK_128X64)
  ------------------
  |  Branch (48:7): [True: 18.4E, False: 639k]
  |  Branch (48:30): [True: 18.4E, False: 639k]
  ------------------
   49|      0|    maxc = AOMMIN(maxc, MI_SIZE_128X128);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
   50|   639k|  else
   51|   639k|    maxc = AOMMIN(maxc, MI_SIZE_64X64);
  ------------------
  |  |   34|   639k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 46.4k, False: 593k]
  |  |  ------------------
  ------------------
   52|   639k|  if (bs == BLOCK_128X128 || bs == BLOCK_64X128)
  ------------------
  |  Branch (52:7): [True: 18.4E, False: 639k]
  |  Branch (52:30): [True: 11, False: 639k]
  ------------------
   53|      0|    maxr = AOMMIN(maxr, MI_SIZE_128X128);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
   54|   639k|  else
   55|   639k|    maxr = AOMMIN(maxr, MI_SIZE_64X64);
  ------------------
  |  |   34|   639k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 72.4k, False: 567k]
  |  |  ------------------
  ------------------
   56|       |
   57|   639k|  const int r_step = 2;  // mi_size_high[BLOCK_8X8]
   58|   639k|  const int c_step = 2;  // mi_size_wide[BLOCK_8X8]
   59|   639k|  const int r_shift = 1;
   60|   639k|  const int c_shift = 1;
   61|   639k|  int count = 0;
   62|  5.44M|  for (int r = 0; r < maxr; r += r_step) {
  ------------------
  |  Branch (62:19): [True: 4.80M, False: 639k]
  ------------------
   63|  41.4M|    for (int c = 0; c < maxc; c += c_step) {
  ------------------
  |  Branch (63:21): [True: 36.6M, False: 4.80M]
  ------------------
   64|  36.6M|      if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c,
  ------------------
  |  Branch (64:11): [True: 27.2M, False: 9.41M]
  ------------------
   65|  36.6M|                             mi_params->mi_stride)) {
   66|  27.2M|        dlist[count].by = r >> r_shift;
   67|  27.2M|        dlist[count].bx = c >> c_shift;
   68|  27.2M|        count++;
   69|  27.2M|      }
   70|  36.6M|    }
   71|  4.80M|  }
   72|   639k|  return count;
   73|   639k|}
av1_cdef_copy_sb8_16_lowbd:
  100|   868k|                                int hsize) {
  101|   868k|  const uint8_t *base = &src[src_voffset * (ptrdiff_t)sstride + src_hoffset];
  102|   868k|  cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
  103|   868k|}
av1_cdef_copy_sb8_16_highbd:
  109|   809k|                                 int hsize) {
  110|   809k|  const uint16_t *base =
  111|   809k|      &CONVERT_TO_SHORTPTR(src)[src_voffset * (ptrdiff_t)sstride + src_hoffset];
  ------------------
  |  |   75|   809k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  112|   809k|  cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
  113|   809k|}
av1_cdef_copy_sb8_16:
  118|  1.67M|                          int src_hoffset, int sstride, int vsize, int hsize) {
  119|  1.67M|#if CONFIG_AV1_HIGHBITDEPTH
  120|  1.67M|  if (cm->seq_params->use_highbitdepth) {
  ------------------
  |  Branch (120:7): [True: 809k, False: 868k]
  ------------------
  121|   809k|    av1_cdef_copy_sb8_16_highbd(dst, dstride, src, src_voffset, src_hoffset,
  122|   809k|                                sstride, vsize, hsize);
  123|   809k|    return;
  124|   809k|  }
  125|       |#else
  126|       |  (void)cm;
  127|       |#endif  // CONFIG_AV1_HIGHBITDEPTH
  128|   868k|  av1_cdef_copy_sb8_16_lowbd(dst, dstride, src, src_voffset, src_hoffset,
  129|   868k|                             sstride, vsize, hsize);
  130|   868k|}
av1_cdef_init_fb_row:
  373|  19.1k|                          struct AV1CdefSyncData *const cdef_sync, int fbr) {
  374|  19.1k|  (void)cdef_sync;
  375|  19.1k|  const int num_planes = av1_num_planes(cm);
  376|  19.1k|  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  19.1k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  19.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  19.1k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  19.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  377|  19.1k|  const int luma_stride =
  378|  19.1k|      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
  ------------------
  |  |   69|  19.1k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  379|  19.1k|  const bool ping_pong = fbr & 1;
  380|       |  // for the current filter block, it's top left corner mi structure (mi_tl)
  381|       |  // is first accessed to check whether the top and left boundaries are
  382|       |  // frame boundaries. Then bottom-left and top-right mi structures are
  383|       |  // accessed to check whether the bottom and right boundaries
  384|       |  // (respectively) are frame boundaries.
  385|       |  //
  386|       |  // Note that we can't just check the bottom-right mi structure - eg. if
  387|       |  // we're at the right-hand edge of the frame but not the bottom, then
  388|       |  // the bottom-right mi is NULL but the bottom-left is not.
  389|  19.1k|  fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
  ------------------
  |  |   58|  19.1k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  19.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (389:34): [True: 6.00k, False: 13.1k]
  ------------------
  390|  19.1k|  if (fbr != nvfb - 1)
  ------------------
  |  Branch (390:7): [True: 13.1k, False: 6.00k]
  ------------------
  391|  13.1k|    fb_info->frame_boundary[BOTTOM] =
  392|  13.1k|        (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
  ------------------
  |  |   58|  13.1k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  13.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (392:9): [True: 0, False: 13.1k]
  ------------------
  393|  6.00k|  else
  394|  6.00k|    fb_info->frame_boundary[BOTTOM] = 1;
  395|       |
  396|  19.1k|  fb_info->src = src;
  397|  19.1k|  fb_info->damping = cm->cdef_info.cdef_damping;
  398|  19.1k|  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
  ------------------
  |  |   35|  19.1k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 9.75k, False: 9.39k]
  |  |  ------------------
  ------------------
  399|  19.1k|  av1_zero(fb_info->dir);
  ------------------
  |  |   43|  19.1k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  400|  19.1k|  av1_zero(fb_info->var);
  ------------------
  |  |   43|  19.1k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  401|       |
  402|  64.2k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (402:23): [True: 45.1k, False: 19.1k]
  ------------------
  403|  45.1k|    const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
  ------------------
  |  |   39|  45.1k|#define MI_SIZE_LOG2 2
  ------------------
  404|  45.1k|    const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
  ------------------
  |  |   58|  45.1k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  45.1k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  405|  45.1k|    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
  406|       |    // here ping-pong buffers are maintained for top linebuf
  407|       |    // to avoid linebuf over-write by consecutive row.
  408|  45.1k|    uint16_t *const top_linebuf =
  409|  45.1k|        &linebuf[plane][ping_pong * CDEF_VBORDER * stride];
  ------------------
  |  |   23|  45.1k|#define CDEF_VBORDER (2)
  ------------------
  410|  45.1k|    fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride];
  ------------------
  |  |   23|  45.1k|#define CDEF_VBORDER (2)
  ------------------
  411|       |
  412|  45.1k|    if (fbr != nvfb - 1)  // top line buffer copy
  ------------------
  |  Branch (412:9): [True: 32.2k, False: 12.8k]
  ------------------
  413|  32.2k|      av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf,
  414|  32.2k|                           offset - CDEF_VBORDER, 0,
  ------------------
  |  |   23|  32.2k|#define CDEF_VBORDER (2)
  ------------------
  415|  32.2k|                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
  ------------------
  |  |   23|  32.2k|#define CDEF_VBORDER (2)
  ------------------
  416|  45.1k|    fb_info->top_linebuf[plane] =
  417|  45.1k|        &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride];
  ------------------
  |  |   23|  45.1k|#define CDEF_VBORDER (2)
  ------------------
  418|       |
  419|  45.1k|    if (fbr != nvfb - 1)  // bottom line buffer copy
  ------------------
  |  Branch (419:9): [True: 32.2k, False: 12.8k]
  ------------------
  420|  32.2k|      av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride,
  421|  32.2k|                           xd->plane[plane].dst.buf, offset, 0,
  422|  32.2k|                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
  ------------------
  |  |   23|  32.2k|#define CDEF_VBORDER (2)
  ------------------
  423|  45.1k|  }
  424|  19.1k|}
av1_cdef_fb_row:
  431|  88.5k|                     struct aom_internal_error_info *error_info) {
  432|       |  // TODO(aomedia:3276): Pass error_info to the low-level functions as required
  433|       |  // in future to handle error propagation.
  434|  88.5k|  (void)error_info;
  435|  88.5k|  CdefBlockInfo fb_info;
  436|  88.5k|  int cdef_left[MAX_MB_PLANE] = { 1, 1, 1 };
  437|  88.5k|  const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  88.5k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  88.5k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  88.5k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  88.5k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  438|       |
  439|  88.5k|  cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr);
  440|  88.5k|#if CONFIG_MULTITHREAD
  441|  88.5k|  if (cdef_sync && cm->cdef_info.allocated_num_workers > 1) {
  ------------------
  |  Branch (441:7): [True: 69.3k, False: 19.1k]
  |  Branch (441:20): [True: 69.3k, False: 9]
  ------------------
  442|  69.3k|    pthread_mutex_lock(cdef_sync->mutex_);
  443|  69.3k|    const bool cdef_mt_exit = cdef_sync->cdef_mt_exit;
  444|  69.3k|    pthread_mutex_unlock(cdef_sync->mutex_);
  445|       |    // Exit in case any worker has encountered an error.
  446|  69.3k|    if (cdef_mt_exit) return;
  ------------------
  |  Branch (446:9): [True: 0, False: 69.3k]
  ------------------
  447|  69.3k|  }
  448|  88.5k|#endif
  449|   735k|  for (int fbc = 0; fbc < nhfb; fbc++) {
  ------------------
  |  Branch (449:21): [True: 647k, False: 88.5k]
  ------------------
  450|   647k|    fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0;
  ------------------
  |  |   58|   647k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   647k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (450:36): [True: 88.1k, False: 559k]
  ------------------
  451|   647k|    if (fbc != nhfb - 1)
  ------------------
  |  Branch (451:9): [True: 559k, False: 87.9k]
  ------------------
  452|   559k|      fb_info.frame_boundary[RIGHT] =
  453|   559k|          (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 1 : 0;
  ------------------
  |  |   58|   559k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   559k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (453:11): [True: 0, False: 559k]
  ------------------
  454|  87.9k|    else
  455|  87.9k|      fb_info.frame_boundary[RIGHT] = 1;
  456|   647k|    cdef_fb_col(cm, xd, &fb_info, colbuf, &cdef_left[0], fbc, fbr);
  457|   647k|  }
  458|  88.5k|}
av1_cdef_frame:
  468|  6.00k|                    MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) {
  469|  6.00k|  const int num_planes = av1_num_planes(cm);
  470|  6.00k|  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  6.00k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.00k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  6.00k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  6.00k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  471|       |
  472|  6.00k|  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
  473|  6.00k|                       num_planes);
  474|       |
  475|  25.1k|  for (int fbr = 0; fbr < nvfb; fbr++)
  ------------------
  |  Branch (475:21): [True: 19.1k, False: 6.00k]
  ------------------
  476|  19.1k|    av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf,
  477|  19.1k|                    cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL,
  478|  19.1k|                    xd->error_info);
  479|  6.00k|}
cdef.c:is_8x8_block_skip:
   30|  36.6M|                             int mi_stride) {
   31|  36.6M|  MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col;
   32|  55.4M|  for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) {
  ------------------
  |  Branch (32:19): [True: 46.0M, False: 9.41M]
  ------------------
   33|  83.7M|    for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) {
  ------------------
  |  Branch (33:21): [True: 64.9M, False: 18.8M]
  ------------------
   34|  64.9M|      if (!mbmi[c]->skip_txfm) return 0;
  ------------------
  |  Branch (34:11): [True: 27.2M, False: 37.7M]
  ------------------
   35|  64.9M|    }
   36|  46.0M|  }
   37|       |
   38|  9.41M|  return 1;
   39|  36.6M|}
cdef.c:cdef_fb_col:
  302|   647k|                        int *cdef_left, int fbc, int fbr) {
  303|   647k|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  304|   647k|  const int mbmi_cdef_strength =
  305|   647k|      mi_params
  306|   647k|          ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
  ------------------
  |  |   58|   647k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   647k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  307|   647k|                         MI_SIZE_64X64 * fbc]
  ------------------
  |  |   58|   647k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   647k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  308|   647k|          ->cdef_strength;
  309|   647k|  const int num_planes = av1_num_planes(cm);
  310|   647k|  int is_zero_level[PLANE_TYPES] = { 1, 1 };
  311|   647k|  int level[PLANE_TYPES] = { 0 };
  312|   647k|  int sec_strength[PLANE_TYPES] = { 0 };
  313|   647k|  const CdefInfo *const cdef_info = &cm->cdef_info;
  314|       |
  315|   647k|  if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
  ------------------
  |  |   58|   647k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   647k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (315:7): [True: 18.4E, False: 647k]
  ------------------
  316|   647k|                              MI_SIZE_64X64 * fbc] == NULL ||
  ------------------
  |  |   58|   647k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   647k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  317|   647k|      mbmi_cdef_strength == -1) {
  ------------------
  |  Branch (317:7): [True: 18.4E, False: 647k]
  ------------------
  318|      0|    av1_zero_array(cdef_left, num_planes);
  ------------------
  |  |   44|      0|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
  319|      0|    return;
  320|      0|  }
  321|       |
  322|       |  // Compute level and secondary strength for planes
  323|   647k|  level[PLANE_TYPE_Y] =
  324|   647k|      cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
  ------------------
  |  |   17|   647k|#define CDEF_SEC_STRENGTHS 4
  ------------------
  325|   647k|  sec_strength[PLANE_TYPE_Y] =
  326|   647k|      cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
  ------------------
  |  |   17|   647k|#define CDEF_SEC_STRENGTHS 4
  ------------------
  327|   647k|  sec_strength[PLANE_TYPE_Y] += sec_strength[PLANE_TYPE_Y] == 3;
  328|   647k|  is_zero_level[PLANE_TYPE_Y] =
  329|   647k|      (level[PLANE_TYPE_Y] == 0) && (sec_strength[PLANE_TYPE_Y] == 0);
  ------------------
  |  Branch (329:7): [True: 212k, False: 434k]
  |  Branch (329:37): [True: 143k, False: 68.8k]
  ------------------
  330|       |
  331|   647k|  if (num_planes > 1) {
  ------------------
  |  Branch (331:7): [True: 618k, False: 28.2k]
  ------------------
  332|   618k|    level[PLANE_TYPE_UV] =
  333|   618k|        cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
  ------------------
  |  |   17|   618k|#define CDEF_SEC_STRENGTHS 4
  ------------------
  334|   618k|    sec_strength[PLANE_TYPE_UV] =
  335|   618k|        cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
  ------------------
  |  |   17|   618k|#define CDEF_SEC_STRENGTHS 4
  ------------------
  336|   618k|    sec_strength[PLANE_TYPE_UV] += sec_strength[PLANE_TYPE_UV] == 3;
  337|   618k|    is_zero_level[PLANE_TYPE_UV] =
  338|   618k|        (level[PLANE_TYPE_UV] == 0) && (sec_strength[PLANE_TYPE_UV] == 0);
  ------------------
  |  Branch (338:9): [True: 31.7k, False: 587k]
  |  Branch (338:40): [True: 18.4k, False: 13.3k]
  ------------------
  339|   618k|  }
  340|       |
  341|   647k|  if (is_zero_level[PLANE_TYPE_Y] && is_zero_level[PLANE_TYPE_UV]) {
  ------------------
  |  Branch (341:7): [True: 143k, False: 503k]
  |  Branch (341:38): [True: 7.93k, False: 135k]
  ------------------
  342|  7.93k|    av1_zero_array(cdef_left, num_planes);
  ------------------
  |  |   44|  7.93k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
  343|  7.93k|    return;
  344|  7.93k|  }
  345|       |
  346|   639k|  fb_info->cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64,
  ------------------
  |  |   58|   639k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   639k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  347|   639k|                                                 fbc * MI_SIZE_64X64,
  ------------------
  |  |   58|   639k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   639k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  348|   639k|                                                 fb_info->dlist, BLOCK_64X64);
  349|   639k|  if (!fb_info->cdef_count) {
  ------------------
  |  Branch (349:7): [True: 182k, False: 456k]
  ------------------
  350|   182k|    av1_zero_array(cdef_left, num_planes);
  ------------------
  |  |   44|   182k|#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
  ------------------
  351|   182k|    return;
  352|   182k|  }
  353|       |
  354|  1.79M|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (354:23): [True: 1.33M, False: 456k]
  ------------------
  355|       |    // Do not skip cdef filtering for luma plane as filter direction is
  356|       |    // computed based on luma.
  357|  1.33M|    if (plane && is_zero_level[get_plane_type(plane)]) {
  ------------------
  |  Branch (357:9): [True: 879k, False: 455k]
  |  Branch (357:18): [True: 7.15k, False: 871k]
  ------------------
  358|  7.15k|      cdef_left[plane] = 0;
  359|  7.15k|      continue;
  360|  7.15k|    }
  361|  1.32M|    cdef_init_fb_col(xd, fb_info, level, sec_strength, fbc, fbr, plane);
  362|  1.32M|    cdef_prepare_fb(cm, fb_info, colbuf, cdef_left[plane], fbc, fbr, plane);
  363|  1.32M|    cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth);
  364|  1.32M|    cdef_left[plane] = 1;
  365|  1.32M|  }
  366|   456k|}
cdef.c:cdef_init_fb_col:
  285|  1.32M|                                    int plane) {
  286|  1.32M|  const PLANE_TYPE plane_type = get_plane_type(plane);
  287|  1.32M|  fb_info->level = level[plane_type];
  288|  1.32M|  fb_info->sec_strength = sec_strength[plane_type];
  289|  1.32M|  fb_info->dst = xd->plane[plane].dst.buf;
  290|  1.32M|  fb_info->dst_stride = xd->plane[plane].dst.stride;
  291|       |
  292|  1.32M|  fb_info->xdec = xd->plane[plane].subsampling_x;
  293|  1.32M|  fb_info->ydec = xd->plane[plane].subsampling_y;
  294|  1.32M|  fb_info->mi_wide_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_x;
  ------------------
  |  |   39|  1.32M|#define MI_SIZE_LOG2 2
  ------------------
  295|  1.32M|  fb_info->mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
  ------------------
  |  |   39|  1.32M|#define MI_SIZE_LOG2 2
  ------------------
  296|  1.32M|  fb_info->roffset = MI_SIZE_64X64 * fbr << fb_info->mi_high_l2;
  ------------------
  |  |   58|  1.32M|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.32M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  297|  1.32M|  fb_info->coffset = MI_SIZE_64X64 * fbc << fb_info->mi_wide_l2;
  ------------------
  |  |   58|  1.32M|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.32M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  298|  1.32M|}
cdef.c:cdef_prepare_fb:
  153|  1.32M|                            int fbc, int fbr, int plane) {
  154|  1.32M|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  155|  1.32M|  uint16_t *src = fb_info->src;
  156|  1.32M|  const int luma_stride =
  157|  1.32M|      ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4);
  ------------------
  |  |   69|  1.32M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  158|  1.32M|  const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  1.32M|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.32M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  1.32M|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.32M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  159|  1.32M|  const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  1.32M|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.32M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  1.32M|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.32M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  160|  1.32M|  int cstart = 0;
  161|  1.32M|  if (!cdef_left) cstart = -CDEF_HBORDER;
  ------------------
  |  |   26|  22.3k|#define CDEF_HBORDER (8)
  ------------------
  |  Branch (161:7): [True: 22.3k, False: 1.30M]
  ------------------
  162|  1.32M|  int rend, cend;
  163|  1.32M|  const int nhb =
  164|  1.32M|      AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
  ------------------
  |  |   34|  1.32M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.17M, False: 149k]
  |  |  ------------------
  ------------------
  165|  1.32M|  const int nvb =
  166|  1.32M|      AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
  ------------------
  |  |   34|  1.32M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.18M, False: 142k]
  |  |  ------------------
  ------------------
  167|  1.32M|  const int hsize = nhb << fb_info->mi_wide_l2;
  168|  1.32M|  const int vsize = nvb << fb_info->mi_high_l2;
  169|  1.32M|  const uint16_t *top_linebuf = fb_info->top_linebuf[plane];
  170|  1.32M|  const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane];
  171|  1.32M|  const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE;
  ------------------
  |  |   23|  1.32M|#define CDEF_VBORDER (2)
  ------------------
                const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE;
  ------------------
  |  |   28|  1.32M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.32M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  172|  1.32M|  const int stride =
  173|  1.32M|      luma_stride >> (plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x);
  ------------------
  |  |  226|  1.32M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (173:23): [True: 456k, False: 871k]
  ------------------
  174|       |
  175|  1.32M|  if (fbc == nhfb - 1)
  ------------------
  |  Branch (175:7): [True: 149k, False: 1.17M]
  ------------------
  176|   149k|    cend = hsize;
  177|  1.17M|  else
  178|  1.17M|    cend = hsize + CDEF_HBORDER;
  ------------------
  |  |   26|  1.17M|#define CDEF_HBORDER (8)
  ------------------
  179|       |
  180|  1.32M|  if (fbr == nvfb - 1)
  ------------------
  |  Branch (180:7): [True: 143k, False: 1.18M]
  ------------------
  181|   143k|    rend = vsize;
  182|  1.18M|  else
  183|  1.18M|    rend = vsize + CDEF_VBORDER;
  ------------------
  |  |   23|  1.18M|#define CDEF_VBORDER (2)
  ------------------
  184|       |
  185|       |  /* Copy in the pixels we need from the current superblock for
  186|       |  deringing.*/
  187|  1.32M|  av1_cdef_copy_sb8_16(
  188|  1.32M|      cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
  ------------------
  |  |   23|  1.32M|#define CDEF_VBORDER (2)
  ------------------
                    cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
  ------------------
  |  |   28|  1.32M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.32M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                    cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
  ------------------
  |  |   26|  1.32M|#define CDEF_HBORDER (8)
  ------------------
  189|  1.32M|      CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart,
  ------------------
  |  |   28|  1.32M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.32M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  190|  1.32M|      fb_info->dst_stride, vsize, cend - cstart);
  191|       |
  192|       |  /* Copy in the pixels we need for the current superblock from bottom buffer.*/
  193|  1.32M|  if (fbr < nvfb - 1) {
  ------------------
  |  Branch (193:7): [True: 1.18M, False: 143k]
  ------------------
  194|  1.18M|    copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|  1.18M|#define CDEF_HBORDER (8)
  ------------------
                  copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|  1.18M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.18M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  195|  1.18M|              &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize);
  ------------------
  |  |   23|  1.18M|#define CDEF_VBORDER (2)
  ------------------
  196|  1.18M|  } else {
  197|   143k|    fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   26|   143k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   28|   143k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   143k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   23|   143k|#define CDEF_VBORDER (2)
  ------------------
  198|   143k|              hsize, CDEF_VERY_LARGE);
  ------------------
  |  |   30|   143k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  199|   143k|  }
  200|  1.32M|  if (fbr < nvfb - 1 && fbc > 0) {
  ------------------
  |  Branch (200:7): [True: 1.18M, False: 139k]
  |  Branch (200:25): [True: 1.08M, False: 104k]
  ------------------
  201|  1.08M|    copy_rect(&src[bot_offset], CDEF_BSTRIDE,
  ------------------
  |  |   28|  1.08M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.08M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  202|  1.08M|              &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride,
  ------------------
  |  |   26|  1.08M|#define CDEF_HBORDER (8)
  ------------------
  203|  1.08M|              CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   23|  1.08M|#define CDEF_VBORDER (2)
  ------------------
                            CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   26|  1.08M|#define CDEF_HBORDER (8)
  ------------------
  204|  1.08M|  } else {
  205|   244k|    fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   28|   244k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   244k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   23|   244k|#define CDEF_VBORDER (2)
  ------------------
                  fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   26|   244k|#define CDEF_HBORDER (8)
  ------------------
  206|   244k|              CDEF_VERY_LARGE);
  ------------------
  |  |   30|   244k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  207|   244k|  }
  208|  1.32M|  if (fbr < nvfb - 1 && fbc < nhfb - 1) {
  ------------------
  |  Branch (208:7): [True: 1.18M, False: 139k]
  |  Branch (208:25): [True: 1.08M, False: 105k]
  ------------------
  209|  1.08M|    copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|  1.08M|#define CDEF_HBORDER (8)
  ------------------
                  copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|  1.08M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.08M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  210|  1.08M|              &bot_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
  ------------------
  |  |   23|  1.08M|#define CDEF_VBORDER (2)
  ------------------
  211|  1.08M|              CDEF_HBORDER);
  ------------------
  |  |   26|  1.08M|#define CDEF_HBORDER (8)
  ------------------
  212|  1.08M|  } else {
  213|   244k|    fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|   244k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|   244k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   244k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  214|   244k|              CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   23|   244k|#define CDEF_VBORDER (2)
  ------------------
                            CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   26|   244k|#define CDEF_HBORDER (8)
  ------------------
                            CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   30|   244k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  215|   244k|  }
  216|       |
  217|       |  /* Copy in the pixels we need from the current superblock from top buffer.*/
  218|  1.32M|  if (fbr > 0) {
  ------------------
  |  Branch (218:7): [True: 1.20M, False: 125k]
  ------------------
  219|  1.20M|    copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset],
  ------------------
  |  |   26|  1.20M|#define CDEF_HBORDER (8)
  ------------------
                  copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset],
  ------------------
  |  |   28|  1.20M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.20M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  220|  1.20M|              stride, CDEF_VBORDER, hsize);
  ------------------
  |  |   23|  1.20M|#define CDEF_VBORDER (2)
  ------------------
  221|  1.20M|  } else {
  222|   125k|    fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
  ------------------
  |  |   26|   125k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
  ------------------
  |  |   28|   125k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   125k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
  ------------------
  |  |   23|   125k|#define CDEF_VBORDER (2)
  ------------------
  223|   125k|              CDEF_VERY_LARGE);
  ------------------
  |  |   30|   125k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  224|   125k|  }
  225|  1.32M|  if (fbr > 0 && fbc > 0) {
  ------------------
  |  Branch (225:7): [True: 1.20M, False: 125k]
  |  Branch (225:18): [True: 1.09M, False: 103k]
  ------------------
  226|  1.09M|    copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER],
  ------------------
  |  |   28|  1.09M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.09M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER],
  ------------------
  |  |   26|  1.09M|#define CDEF_HBORDER (8)
  ------------------
  227|  1.09M|              stride, CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   23|  1.09M|#define CDEF_VBORDER (2)
  ------------------
                            stride, CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   26|  1.09M|#define CDEF_HBORDER (8)
  ------------------
  228|  1.09M|  } else {
  229|   229k|    fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   28|   229k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   229k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   23|   229k|#define CDEF_VBORDER (2)
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   26|   229k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   30|   229k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  230|   229k|  }
  231|  1.32M|  if (fbr > 0 && fbc < nhfb - 1) {
  ------------------
  |  Branch (231:7): [True: 1.20M, False: 125k]
  |  Branch (231:18): [True: 1.09M, False: 104k]
  ------------------
  232|  1.09M|    copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|  1.09M|#define CDEF_HBORDER (8)
  ------------------
                  copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|  1.09M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.09M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  233|  1.09M|              &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
  ------------------
  |  |   23|  1.09M|#define CDEF_VBORDER (2)
  ------------------
  234|  1.09M|              CDEF_HBORDER);
  ------------------
  |  |   26|  1.09M|#define CDEF_HBORDER (8)
  ------------------
  235|  1.09M|  } else {
  236|   230k|    fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   26|   230k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   28|   230k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   230k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
  ------------------
  |  |   23|   230k|#define CDEF_VBORDER (2)
  ------------------
  237|   230k|              CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   26|   230k|#define CDEF_HBORDER (8)
  ------------------
                            CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   30|   230k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  238|   230k|  }
  239|  1.32M|  if (cdef_left) {
  ------------------
  |  Branch (239:7): [True: 1.30M, False: 18.9k]
  ------------------
  240|       |    /* If we deringed the superblock on the left then we need to copy in
  241|       |    saved pixels. */
  242|  1.30M|    copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER,
  ------------------
  |  |   28|  1.30M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.30M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER,
  ------------------
  |  |   26|  1.30M|#define CDEF_HBORDER (8)
  ------------------
  243|  1.30M|              rend + CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   23|  1.30M|#define CDEF_VBORDER (2)
  ------------------
                            rend + CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   26|  1.30M|#define CDEF_HBORDER (8)
  ------------------
  244|  1.30M|  }
  245|       |  /* Saving pixels in case we need to dering the superblock on the
  246|       |  right. */
  247|  1.32M|  copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
  ------------------
  |  |   26|  1.32M|#define CDEF_HBORDER (8)
  ------------------
                copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
  ------------------
  |  |   28|  1.32M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  1.32M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  248|  1.32M|            rend + CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   23|  1.32M|#define CDEF_VBORDER (2)
  ------------------
                          rend + CDEF_VBORDER, CDEF_HBORDER);
  ------------------
  |  |   26|  1.32M|#define CDEF_HBORDER (8)
  ------------------
  249|       |
  250|  1.32M|  if (fb_info->frame_boundary[LEFT]) {
  ------------------
  |  Branch (250:7): [True: 149k, False: 1.17M]
  ------------------
  251|   149k|    fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   28|   149k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   149k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   23|   149k|#define CDEF_VBORDER (2)
  ------------------
                  fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
  ------------------
  |  |   26|   149k|#define CDEF_HBORDER (8)
  ------------------
  252|   149k|              CDEF_VERY_LARGE);
  ------------------
  |  |   30|   149k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  253|   149k|  }
  254|  1.32M|  if (fb_info->frame_boundary[RIGHT]) {
  ------------------
  |  Branch (254:7): [True: 149k, False: 1.17M]
  ------------------
  255|   149k|    fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   26|   149k|#define CDEF_HBORDER (8)
  ------------------
                  fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
  ------------------
  |  |   28|   149k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   149k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  256|   149k|              vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   23|   149k|#define CDEF_VBORDER (2)
  ------------------
                            vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   26|   149k|#define CDEF_HBORDER (8)
  ------------------
                            vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
  ------------------
  |  |   30|   149k|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  257|   149k|  }
  258|  1.32M|}
cdef.c:copy_rect:
  133|  9.36M|                             int sstride, int v, int h) {
  134|   140M|  for (int i = 0; i < v; i++) {
  ------------------
  |  Branch (134:19): [True: 131M, False: 9.36M]
  ------------------
  135|  1.34G|    for (int j = 0; j < h; j++) {
  ------------------
  |  Branch (135:21): [True: 1.21G, False: 131M]
  ------------------
  136|  1.21G|      dst[i * dstride + j] = src[i * sstride + j];
  137|  1.21G|    }
  138|   131M|  }
  139|  9.36M|}
cdef.c:cdef_filter_fb:
  261|  1.33M|                                  uint8_t use_highbitdepth) {
  262|  1.33M|  ptrdiff_t offset =
  263|  1.33M|      (ptrdiff_t)fb_info->dst_stride * fb_info->roffset + fb_info->coffset;
  264|  1.33M|  if (use_highbitdepth) {
  ------------------
  |  Branch (264:7): [True: 658k, False: 672k]
  ------------------
  265|   658k|    av1_cdef_filter_fb(
  266|   658k|        NULL, CONVERT_TO_SHORTPTR(fb_info->dst + offset), fb_info->dst_stride,
  ------------------
  |  |   75|   658k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  267|   658k|        &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   23|   658k|#define CDEF_VBORDER (2)
  ------------------
                      &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   28|   658k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   658k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                      &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   26|   658k|#define CDEF_HBORDER (8)
  ------------------
  268|   658k|        fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane,
  269|   658k|        fb_info->dlist, fb_info->cdef_count, fb_info->level,
  270|   658k|        fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift);
  271|   672k|  } else {
  272|   672k|    av1_cdef_filter_fb(
  273|   672k|        fb_info->dst + offset, NULL, fb_info->dst_stride,
  274|   672k|        &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   23|   672k|#define CDEF_VBORDER (2)
  ------------------
                      &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   28|   672k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|   672k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
                      &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER],
  ------------------
  |  |   26|   672k|#define CDEF_HBORDER (8)
  ------------------
  275|   672k|        fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane,
  276|   672k|        fb_info->dlist, fb_info->cdef_count, fb_info->level,
  277|   672k|        fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift);
  278|   672k|  }
  279|  1.33M|}

av1_cdef_filter_fb:
  328|  1.33M|                        int sec_strength, int damping, int coeff_shift) {
  329|  1.33M|  int bi;
  330|  1.33M|  int bx;
  331|  1.33M|  int by;
  332|  1.33M|  const int pri_strength = level << coeff_shift;
  333|  1.33M|  sec_strength <<= coeff_shift;
  334|  1.33M|  damping += coeff_shift - (pli != AOM_PLANE_Y);
  ------------------
  |  |  226|  1.33M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  335|  1.33M|  const int bw_log2 = 3 - xdec;
  336|  1.33M|  const int bh_log2 = 3 - ydec;
  337|  1.33M|  if (dirinit && pri_strength == 0 && sec_strength == 0) {
  ------------------
  |  Branch (337:7): [True: 0, False: 1.33M]
  |  Branch (337:18): [True: 0, False: 0]
  |  Branch (337:39): [True: 0, False: 0]
  ------------------
  338|       |    // If we're here, both primary and secondary strengths are 0, and
  339|       |    // we still haven't written anything to y[] yet, so we just copy
  340|       |    // the input to y[]. This is necessary only for av1_cdef_search()
  341|       |    // and only av1_cdef_search() sets dirinit.
  342|      0|    for (bi = 0; bi < cdef_count; bi++) {
  ------------------
  |  Branch (342:18): [True: 0, False: 0]
  ------------------
  343|      0|      by = dlist[bi].by;
  344|      0|      bx = dlist[bi].bx;
  345|       |      // TODO(stemidts/jmvalin): SIMD optimisations
  346|      0|      for (int iy = 0; iy < 1 << bh_log2; iy++) {
  ------------------
  |  Branch (346:24): [True: 0, False: 0]
  ------------------
  347|      0|        memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)],
  348|      0|               &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)],
  ------------------
  |  |   28|      0|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|      0|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  349|      0|               ((size_t)1 << bw_log2) * sizeof(*dst16));
  350|      0|      }
  351|      0|    }
  352|      0|    return;
  353|      0|  }
  354|       |
  355|  1.33M|  if (pli == 0) {
  ------------------
  |  Branch (355:7): [True: 456k, False: 874k]
  ------------------
  356|   456k|    if (!dirinit || !*dirinit) {
  ------------------
  |  Branch (356:9): [True: 456k, False: 0]
  |  Branch (356:21): [True: 0, False: 0]
  ------------------
  357|   456k|      aom_cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir);
  358|   456k|      if (dirinit) *dirinit = 1;
  ------------------
  |  Branch (358:11): [True: 0, False: 456k]
  ------------------
  359|   456k|    }
  360|   456k|  }
  361|  1.33M|  if (pli == 1 && xdec != ydec) {
  ------------------
  |  Branch (361:7): [True: 437k, False: 894k]
  |  Branch (361:19): [True: 3.17k, False: 434k]
  ------------------
  362|  21.1k|    for (bi = 0; bi < cdef_count; bi++) {
  ------------------
  |  Branch (362:18): [True: 17.9k, False: 3.17k]
  ------------------
  363|  17.9k|      static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
  364|  17.9k|      static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
  365|  17.9k|      by = dlist[bi].by;
  366|  17.9k|      bx = dlist[bi].bx;
  367|  17.9k|      dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
  ------------------
  |  Branch (367:22): [True: 17.9k, False: 0]
  ------------------
  368|  17.9k|    }
  369|  3.17k|  }
  370|       |
  371|  1.33M|  if (dst8) {
  ------------------
  |  Branch (371:7): [True: 672k, False: 659k]
  ------------------
  372|   672k|    const int block_width = 8 >> xdec;
  373|   672k|    const int block_height = 8 >> ydec;
  374|       |    /*
  375|       |     * strength_index == 0 : enable_primary = 1, enable_secondary = 1
  376|       |     * strength_index == 1 : enable_primary = 1, enable_secondary = 0
  377|       |     * strength_index == 2 : enable_primary = 0, enable_secondary = 1
  378|       |     * strength_index == 3 : enable_primary = 0, enable_secondary = 0
  379|       |     */
  380|   672k|    const cdef_filter_block_func cdef_filter_fn[4] = {
  381|   672k|      cdef_filter_8_0, cdef_filter_8_1, cdef_filter_8_2, cdef_filter_8_3
  382|   672k|    };
  383|       |
  384|  36.1M|    for (bi = 0; bi < cdef_count; bi++) {
  ------------------
  |  Branch (384:18): [True: 35.4M, False: 672k]
  ------------------
  385|  35.4M|      by = dlist[bi].by;
  386|  35.4M|      bx = dlist[bi].bx;
  387|  35.4M|      const int t =
  388|  35.4M|          (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]));
  ------------------
  |  Branch (388:12): [True: 24.4M, False: 10.9M]
  ------------------
  389|  35.4M|      const int strength_index = (sec_strength == 0) | ((t == 0) << 1);
  390|       |
  391|  35.4M|      cdef_filter_fn[strength_index](
  392|  35.4M|          &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], dstride,
  393|  35.4M|          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t,
  ------------------
  |  |   28|  35.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  35.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  394|  35.4M|          sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping,
  ------------------
  |  Branch (394:25): [True: 32.2M, False: 3.21M]
  ------------------
  395|  35.4M|          coeff_shift, block_width, block_height);
  396|  35.4M|    }
  397|   672k|  } else {
  398|   659k|    const int block_width = 8 >> xdec;
  399|   659k|    const int block_height = 8 >> ydec;
  400|       |    /*
  401|       |     * strength_index == 0 : enable_primary = 1, enable_secondary = 1
  402|       |     * strength_index == 1 : enable_primary = 1, enable_secondary = 0
  403|       |     * strength_index == 2 : enable_primary = 0, enable_secondary = 1
  404|       |     * strength_index == 3 : enable_primary = 0, enable_secondary = 0
  405|       |     */
  406|   659k|    const cdef_filter_block_func cdef_filter_fn[4] = {
  407|   659k|      cdef_filter_16_0, cdef_filter_16_1, cdef_filter_16_2, cdef_filter_16_3
  408|   659k|    };
  409|       |
  410|  30.0M|    for (bi = 0; bi < cdef_count; bi++) {
  ------------------
  |  Branch (410:18): [True: 29.4M, False: 659k]
  ------------------
  411|  29.4M|      by = dlist[bi].by;
  412|  29.4M|      bx = dlist[bi].bx;
  413|  29.4M|      const int t =
  414|  29.4M|          (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]));
  ------------------
  |  Branch (414:12): [True: 22.8M, False: 6.61M]
  ------------------
  415|  29.4M|      const int strength_index = (sec_strength == 0) | ((t == 0) << 1);
  416|       |
  417|  29.4M|      cdef_filter_fn[strength_index](
  418|  29.4M|          &dst16[dirinit ? bi << (bw_log2 + bh_log2)
  ------------------
  |  Branch (418:18): [True: 0, False: 29.4M]
  ------------------
  419|  29.4M|                         : (by << bh_log2) * dstride + (bx << bw_log2)],
  420|  29.4M|          dirinit ? 1 << bw_log2 : dstride,
  ------------------
  |  Branch (420:11): [True: 0, False: 29.4M]
  ------------------
  421|  29.4M|          &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t,
  ------------------
  |  |   28|  29.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  29.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  422|  29.4M|          sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping,
  ------------------
  |  Branch (422:25): [True: 27.1M, False: 2.26M]
  ------------------
  423|  29.4M|          coeff_shift, block_width, block_height);
  424|  29.4M|    }
  425|   659k|  }
  426|  1.33M|}
cdef_block.c:aom_cdef_find_dir:
  298|   456k|                                     int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) {
  299|   456k|  int bi;
  300|       |
  301|       |  // Find direction of two 8x8 blocks together.
  302|  13.8M|  for (bi = 0; bi < cdef_count - 1; bi += 2) {
  ------------------
  |  Branch (302:16): [True: 13.3M, False: 456k]
  ------------------
  303|  13.3M|    const int by = dlist[bi].by;
  304|  13.3M|    const int bx = dlist[bi].bx;
  305|  13.3M|    const int by2 = dlist[bi + 1].by;
  306|  13.3M|    const int bx2 = dlist[bi + 1].bx;
  307|  13.3M|    const int pos1 = 8 * by * CDEF_BSTRIDE + 8 * bx;
  ------------------
  |  |   28|  13.3M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  13.3M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  308|  13.3M|    const int pos2 = 8 * by2 * CDEF_BSTRIDE + 8 * bx2;
  ------------------
  |  |   28|  13.3M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  13.3M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  309|  13.3M|    cdef_find_dir_dual(&in[pos1], &in[pos2], CDEF_BSTRIDE, &var[by][bx],
  ------------------
  |  |   28|  13.3M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  13.3M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  310|  13.3M|                       &var[by2][bx2], coeff_shift, &dir[by][bx],
  311|  13.3M|                       &dir[by2][bx2]);
  312|  13.3M|  }
  313|       |
  314|       |  // Process remaining 8x8 blocks here. One 8x8 at a time.
  315|   456k|  if (cdef_count % 2) {
  ------------------
  |  Branch (315:7): [True: 8.18k, False: 448k]
  ------------------
  316|  8.18k|    const int by = dlist[bi].by;
  317|  8.18k|    const int bx = dlist[bi].bx;
  318|  8.18k|    dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
  ------------------
  |  |   28|  8.18k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  8.18k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  319|  8.18k|                                CDEF_BSTRIDE, &var[by][bx], coeff_shift);
  ------------------
  |  |   28|  8.18k|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  8.18k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  320|  8.18k|  }
  321|   456k|}
cdef_block.c:adjust_strength:
  289|  24.1M|static inline int adjust_strength(int strength, int32_t var) {
  290|  24.1M|  const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0;
  ------------------
  |  |   34|  14.3M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 14.3M, False: 31.3k]
  |  |  ------------------
  ------------------
  |  Branch (290:17): [True: 14.3M, False: 9.77M]
  ------------------
  291|       |  /* We use the variance of 8x8 blocks to adjust the strength. */
  292|  24.1M|  return var ? (strength * (4 + i) + 8) >> 4 : 0;
  ------------------
  |  Branch (292:10): [True: 17.1M, False: 6.99M]
  ------------------
  293|  24.1M|}

cdef.c:fill_rect:
   58|  1.53M|                             uint16_t x) {
   59|  16.1M|  for (int i = 0; i < v; i++) {
  ------------------
  |  Branch (59:19): [True: 14.5M, False: 1.53M]
  ------------------
   60|   148M|    for (int j = 0; j < h; j++) {
  ------------------
  |  Branch (60:21): [True: 133M, False: 14.5M]
  ------------------
   61|   133M|      dst[i * dstride + j] = x;
   62|   133M|    }
   63|  14.5M|  }
   64|  1.53M|}

cdef_find_dir_avx2:
  163|  8.18k|                             int coeff_shift) {
  164|  8.18k|  int i;
  165|  8.18k|  int32_t cost[8];
  166|  8.18k|  int32_t best_cost = 0;
  167|  8.18k|  int best_dir = 0;
  168|  8.18k|  v128 lines[8];
  169|  73.6k|  for (i = 0; i < 8; i++) {
  ------------------
  |  Branch (169:15): [True: 65.4k, False: 8.18k]
  ------------------
  170|  65.4k|    lines[i] = v128_load_unaligned(&img[i * stride]);
  171|  65.4k|    lines[i] =
  172|  65.4k|        v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
  173|  65.4k|  }
  174|       |
  175|       |  /* Compute "mostly vertical" directions. */
  176|  8.18k|  v128 dir47 = compute_directions(lines, cost + 4);
  177|       |
  178|  8.18k|  array_reverse_transpose_8x8(lines, lines);
  179|       |
  180|       |  /* Compute "mostly horizontal" directions. */
  181|  8.18k|  v128 dir03 = compute_directions(lines, cost);
  182|       |
  183|  8.18k|  v128 max = v128_max_s32(dir03, dir47);
  184|  8.18k|  max = v128_max_s32(max, v128_align(max, max, 8));
  ------------------
  |  |   75|  8.18k|#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
  |  |  ------------------
  |  |  |  Branch (75:30): [Folded - Ignored]
  |  |  ------------------
  ------------------
  185|  8.18k|  max = v128_max_s32(max, v128_align(max, max, 4));
  ------------------
  |  |   75|  8.18k|#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
  |  |  ------------------
  |  |  |  Branch (75:30): [Folded - Ignored]
  |  |  ------------------
  ------------------
  186|  8.18k|  best_cost = v128_low_u32(max);
  187|  8.18k|  v128 t =
  188|  8.18k|      v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
  189|  8.18k|  best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
  190|  8.18k|  best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros
  191|       |
  192|       |  /* Difference between the optimal variance and the variance along the
  193|       |     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  194|  8.18k|  *var = best_cost - cost[(best_dir + 4) & 7];
  195|       |  /* We'd normally divide by 840, but dividing by 1024 is close enough
  196|       |     for what we're going to do with this. */
  197|  8.18k|  *var >>= 10;
  198|  8.18k|  return best_dir;
  199|  8.18k|}
cdef_filter_8_0_avx2:
  687|  26.5M|                                int block_height) {
  688|  26.5M|  if (block_width == 8) {
  ------------------
  |  Branch (688:7): [True: 4.73M, False: 21.7M]
  ------------------
  689|  4.73M|    filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  690|  4.73M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  691|  4.73M|                     block_height, /*enable_primary=*/1,
  692|  4.73M|                     /*enable_secondary=*/1);
  693|  21.7M|  } else {
  694|  21.7M|    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  695|  21.7M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  696|  21.7M|                     block_height, /*enable_primary=*/1,
  697|  21.7M|                     /*enable_secondary=*/1);
  698|  21.7M|  }
  699|  26.5M|}
cdef_filter_8_1_avx2:
  705|  3.15M|                                int block_height) {
  706|  3.15M|  if (block_width == 8) {
  ------------------
  |  Branch (706:7): [True: 1.80M, False: 1.35M]
  ------------------
  707|  1.80M|    filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  708|  1.80M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  709|  1.80M|                     block_height, /*enable_primary=*/1,
  710|  1.80M|                     /*enable_secondary=*/0);
  711|  1.80M|  } else {
  712|  1.35M|    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  713|  1.35M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  714|  1.35M|                     block_height, /*enable_primary=*/1,
  715|  1.35M|                     /*enable_secondary=*/0);
  716|  1.35M|  }
  717|  3.15M|}
cdef_filter_8_2_avx2:
  722|  4.45M|                                int block_height) {
  723|  4.45M|  if (block_width == 8) {
  ------------------
  |  Branch (723:7): [True: 4.25M, False: 199k]
  ------------------
  724|  4.25M|    filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  725|  4.25M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  726|  4.25M|                     block_height, /*enable_primary=*/0,
  727|  4.25M|                     /*enable_secondary=*/1);
  728|  4.25M|  } else {
  729|   199k|    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
  730|   199k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  731|   199k|                     block_height, /*enable_primary=*/0,
  732|   199k|                     /*enable_secondary=*/1);
  733|   199k|  }
  734|  4.45M|}
cdef_filter_8_3_avx2:
  740|  2.79M|                                int block_height) {
  741|  2.79M|  (void)pri_strength;
  742|  2.79M|  (void)sec_strength;
  743|  2.79M|  (void)dir;
  744|  2.79M|  (void)pri_damping;
  745|  2.79M|  (void)sec_damping;
  746|  2.79M|  (void)coeff_shift;
  747|  2.79M|  (void)block_width;
  748|       |
  749|  2.79M|  if (block_width == 8) {
  ------------------
  |  Branch (749:7): [True: 2.79M, False: 18.4E]
  ------------------
  750|  2.79M|    copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
  751|  18.4E|  } else {
  752|  18.4E|    copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
  753|  18.4E|  }
  754|  2.79M|}
cdef_filter_16_0_avx2:
  760|  25.0M|                                 int block_height) {
  761|  25.0M|  if (block_width == 8) {
  ------------------
  |  Branch (761:7): [True: 4.96M, False: 20.0M]
  ------------------
  762|  4.96M|    filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  763|  4.96M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  764|  4.96M|                     block_height, /*enable_primary=*/1,
  765|  4.96M|                     /*enable_secondary=*/1);
  766|  20.0M|  } else {
  767|  20.0M|    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  768|  20.0M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  769|  20.0M|                     block_height, /*enable_primary=*/1,
  770|  20.0M|                     /*enable_secondary=*/1);
  771|  20.0M|  }
  772|  25.0M|}
cdef_filter_16_1_avx2:
  778|  3.22M|                                 int block_height) {
  779|  3.22M|  if (block_width == 8) {
  ------------------
  |  Branch (779:7): [True: 1.94M, False: 1.27M]
  ------------------
  780|  1.94M|    filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  781|  1.94M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  782|  1.94M|                     block_height, /*enable_primary=*/1,
  783|  1.94M|                     /*enable_secondary=*/0);
  784|  1.94M|  } else {
  785|  1.27M|    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  786|  1.27M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  787|  1.27M|                     block_height, /*enable_primary=*/1,
  788|  1.27M|                     /*enable_secondary=*/0);
  789|  1.27M|  }
  790|  3.22M|}
cdef_filter_16_2_avx2:
  795|  3.63M|                                 int block_height) {
  796|  3.63M|  if (block_width == 8) {
  ------------------
  |  Branch (796:7): [True: 3.49M, False: 137k]
  ------------------
  797|  3.49M|    filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  798|  3.49M|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  799|  3.49M|                     block_height, /*enable_primary=*/0,
  800|  3.49M|                     /*enable_secondary=*/1);
  801|  3.49M|  } else {
  802|   137k|    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
  803|   137k|                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
  804|   137k|                     block_height, /*enable_primary=*/0,
  805|   137k|                     /*enable_secondary=*/1);
  806|   137k|  }
  807|  3.63M|}
cdef_filter_16_3_avx2:
  813|  2.43M|                                 int block_height) {
  814|  2.43M|  (void)pri_strength;
  815|  2.43M|  (void)sec_strength;
  816|  2.43M|  (void)dir;
  817|  2.43M|  (void)pri_damping;
  818|  2.43M|  (void)sec_damping;
  819|  2.43M|  (void)coeff_shift;
  820|  2.43M|  (void)block_width;
  821|  2.43M|  if (block_width == 8) {
  ------------------
  |  Branch (821:7): [True: 2.43M, False: 18.4E]
  ------------------
  822|  2.43M|    copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
  823|  18.4E|  } else {
  824|  18.4E|    copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
  825|  18.4E|  }
  826|  2.43M|}
cdef_copy_rect8_16bit_to_16bit_avx2:
  831|   809k|                                               int width, int height) {
  832|   809k|  int i, j;
  833|  24.1M|  for (i = 0; i < height; i++) {
  ------------------
  |  Branch (833:15): [True: 23.3M, False: 809k]
  ------------------
  834|   179M|    for (j = 0; j < (width & ~0x7); j += 8) {
  ------------------
  |  Branch (834:17): [True: 156M, False: 23.3M]
  ------------------
  835|   156M|      v128 row = v128_load_unaligned(&src[i * sstride + j]);
  836|   156M|      v128_store_unaligned(&dst[i * dstride + j], row);
  837|   156M|    }
  838|  26.4M|    for (; j < width; j++) {
  ------------------
  |  Branch (838:12): [True: 3.17M, False: 23.3M]
  ------------------
  839|  3.17M|      dst[i * dstride + j] = src[i * sstride + j];
  840|  3.17M|    }
  841|  23.3M|  }
  842|   809k|}
cdef_block_avx2.c:compute_directions:
   62|  16.3k|static inline v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
   63|  16.3k|  v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
   64|  16.3k|  v128 partial6;
   65|  16.3k|  v128 tmp;
   66|       |  /* Partial sums for lines 0 and 1. */
   67|  16.3k|  partial4a = v128_shl_n_byte(lines[0], 14);
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   68|  16.3k|  partial4b = v128_shr_n_byte(lines[0], 2);
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   69|  16.3k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   70|  16.3k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   71|  16.3k|  tmp = v128_add_16(lines[0], lines[1]);
   72|  16.3k|  partial5a = v128_shl_n_byte(tmp, 10);
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   73|  16.3k|  partial5b = v128_shr_n_byte(tmp, 6);
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   74|  16.3k|  partial7a = v128_shl_n_byte(tmp, 4);
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   75|  16.3k|  partial7b = v128_shr_n_byte(tmp, 12);
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   76|  16.3k|  partial6 = tmp;
   77|       |
   78|       |  /* Partial sums for lines 2 and 3. */
   79|  16.3k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   80|  16.3k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   81|  16.3k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   82|  16.3k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   83|  16.3k|  tmp = v128_add_16(lines[2], lines[3]);
   84|  16.3k|  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   85|  16.3k|  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   86|  16.3k|  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   87|  16.3k|  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   88|  16.3k|  partial6 = v128_add_16(partial6, tmp);
   89|       |
   90|       |  /* Partial sums for lines 4 and 5. */
   91|  16.3k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   92|  16.3k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   93|  16.3k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   94|  16.3k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   95|  16.3k|  tmp = v128_add_16(lines[4], lines[5]);
   96|  16.3k|  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   97|  16.3k|  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
   98|  16.3k|  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
   99|  16.3k|  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
  100|  16.3k|  partial6 = v128_add_16(partial6, tmp);
  101|       |
  102|       |  /* Partial sums for lines 6 and 7. */
  103|  16.3k|  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
  104|  16.3k|  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
  105|  16.3k|  partial4a = v128_add_16(partial4a, lines[7]);
  106|  16.3k|  tmp = v128_add_16(lines[6], lines[7]);
  107|  16.3k|  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
  108|  16.3k|  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
  109|  16.3k|  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
  ------------------
  |  |  595|  16.3k|#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
  ------------------
  110|  16.3k|  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
  ------------------
  |  |  596|  16.3k|#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
  ------------------
  111|  16.3k|  partial6 = v128_add_16(partial6, tmp);
  112|       |
  113|       |  /* Compute costs in terms of partial sums. */
  114|  16.3k|  partial4a =
  115|  16.3k|      fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
  116|  16.3k|                       v128_from_32(105, 120, 140, 168));
  117|  16.3k|  partial7a =
  118|  16.3k|      fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
  119|  16.3k|                       v128_from_32(105, 105, 105, 140));
  120|  16.3k|  partial5a =
  121|  16.3k|      fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
  122|  16.3k|                       v128_from_32(105, 105, 105, 140));
  123|  16.3k|  partial6 = v128_madd_s16(partial6, partial6);
  124|  16.3k|  partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
  125|       |
  126|  16.3k|  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  127|  16.3k|  v128_store_unaligned(tmp_cost1, partial4a);
  128|  16.3k|  return partial4a;
  129|  16.3k|}
cdef_block_avx2.c:fold_mul_and_sum:
   27|  49.0k|                                    v128 const2) {
   28|  49.0k|  v128 tmp;
   29|       |  /* Reverse partial B. */
   30|  49.0k|  partialb = v128_shuffle_8(
   31|  49.0k|      partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
   32|       |  /* Interleave the x and y values of identical indices and pair x8 with 0. */
   33|  49.0k|  tmp = partiala;
   34|  49.0k|  partiala = v128_ziplo_16(partialb, partiala);
   35|  49.0k|  partialb = v128_ziphi_16(partialb, tmp);
   36|       |  /* Square and add the corresponding x and y values. */
   37|  49.0k|  partiala = v128_madd_s16(partiala, partiala);
   38|  49.0k|  partialb = v128_madd_s16(partialb, partialb);
   39|       |  /* Multiply by constant. */
   40|  49.0k|  partiala = v128_mullo_s32(partiala, const1);
   41|  49.0k|  partialb = v128_mullo_s32(partialb, const2);
   42|       |  /* Sum all results. */
   43|  49.0k|  partiala = v128_add_32(partiala, partialb);
   44|  49.0k|  return partiala;
   45|  49.0k|}
cdef_block_avx2.c:hsum4:
   47|  16.3k|static inline v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
   48|  16.3k|  v128 t0, t1, t2, t3;
   49|  16.3k|  t0 = v128_ziplo_32(x1, x0);
   50|  16.3k|  t1 = v128_ziplo_32(x3, x2);
   51|  16.3k|  t2 = v128_ziphi_32(x1, x0);
   52|  16.3k|  t3 = v128_ziphi_32(x3, x2);
   53|  16.3k|  x0 = v128_ziplo_64(t1, t0);
   54|  16.3k|  x1 = v128_ziphi_64(t1, t0);
   55|  16.3k|  x2 = v128_ziplo_64(t3, t2);
   56|  16.3k|  x3 = v128_ziphi_64(t3, t2);
   57|  16.3k|  return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
   58|  16.3k|}
cdef_block_avx2.c:array_reverse_transpose_8x8:
  133|  8.18k|static inline void array_reverse_transpose_8x8(v128 *in, v128 *res) {
  134|  8.18k|  const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
  135|  8.18k|  const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
  136|  8.18k|  const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
  137|  8.18k|  const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
  138|  8.18k|  const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
  139|  8.18k|  const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
  140|  8.18k|  const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
  141|  8.18k|  const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
  142|       |
  143|  8.18k|  const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
  144|  8.18k|  const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
  145|  8.18k|  const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
  146|  8.18k|  const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
  147|  8.18k|  const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
  148|  8.18k|  const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
  149|  8.18k|  const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
  150|  8.18k|  const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
  151|       |
  152|  8.18k|  res[7] = v128_ziplo_64(tr1_1, tr1_0);
  153|  8.18k|  res[6] = v128_ziphi_64(tr1_1, tr1_0);
  154|  8.18k|  res[5] = v128_ziplo_64(tr1_3, tr1_2);
  155|  8.18k|  res[4] = v128_ziphi_64(tr1_3, tr1_2);
  156|  8.18k|  res[3] = v128_ziplo_64(tr1_5, tr1_4);
  157|  8.18k|  res[2] = v128_ziphi_64(tr1_5, tr1_4);
  158|  8.18k|  res[1] = v128_ziplo_64(tr1_7, tr1_6);
  159|  8.18k|  res[0] = v128_ziphi_64(tr1_7, tr1_6);
  160|  8.18k|}
cdef_block_avx2.c:filter_block_8x8:
  476|  20.4M|                                  int enable_primary, int enable_secondary) {
  477|  20.4M|  uint8_t *dst8 = (uint8_t *)dest;
  478|  20.4M|  uint16_t *dst16 = (uint16_t *)dest;
  479|  20.4M|  const int clipping_required = enable_primary && enable_secondary;
  ------------------
  |  Branch (479:33): [True: 13.0M, False: 7.35M]
  |  Branch (479:51): [True: 9.71M, False: 3.35M]
  ------------------
  480|  20.4M|  int i;
  481|  20.4M|  v256 sum, p0, p1, p2, p3, row, res;
  482|  20.4M|  const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
  ------------------
  |  |   30|  20.4M|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  483|  20.4M|  v256 max, min;
  484|  20.4M|  const int po1 = cdef_directions[dir][0];
  485|  20.4M|  const int po2 = cdef_directions[dir][1];
  486|  20.4M|  const int s1o1 = cdef_directions[dir + 2][0];
  487|  20.4M|  const int s1o2 = cdef_directions[dir + 2][1];
  488|  20.4M|  const int s2o1 = cdef_directions[dir - 2][0];
  489|  20.4M|  const int s2o2 = cdef_directions[dir - 2][1];
  490|  20.4M|  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  491|  20.4M|  const int *sec_taps = cdef_sec_taps;
  492|       |
  493|  20.4M|  if (enable_primary && pri_strength)
  ------------------
  |  Branch (493:7): [True: 13.3M, False: 7.11M]
  |  Branch (493:25): [True: 13.3M, False: 18.4E]
  ------------------
  494|  13.3M|    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  ------------------
  |  |   35|  13.3M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 6.72k, False: 13.3M]
  |  |  ------------------
  ------------------
  495|  20.4M|  if (enable_secondary && sec_strength)
  ------------------
  |  Branch (495:7): [True: 17.3M, False: 3.05M]
  |  Branch (495:27): [True: 17.3M, False: 18.4E]
  ------------------
  496|  17.4M|    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  ------------------
  |  |   35|  17.4M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 17.4M]
  |  |  ------------------
  ------------------
  497|       |
  498|  83.4M|  for (i = 0; i < height; i += 2) {
  ------------------
  |  Branch (498:15): [True: 63.0M, False: 20.4M]
  ------------------
  499|  63.0M|    v256 tap[8];
  500|  63.0M|    sum = v256_zero();
  501|  63.0M|    row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
  ------------------
  |  |   28|  63.0M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  63.0M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  502|  63.0M|                         v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
  ------------------
  |  |   28|  63.0M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  63.0M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  503|       |
  504|  63.0M|    min = max = row;
  505|  63.0M|    if (enable_primary) {
  ------------------
  |  Branch (505:9): [True: 42.2M, False: 20.7M]
  ------------------
  506|       |      // Primary near taps
  507|  42.2M|      tap[0] = v256_from_v128(
  508|  42.2M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
  ------------------
  |  |   28|  42.2M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  42.2M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  509|  42.2M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
  ------------------
  |  |   28|  42.2M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  42.2M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  510|  42.2M|      tap[1] = v256_from_v128(
  511|  42.2M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
  ------------------
  |  |   28|  42.2M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  42.2M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  512|  42.2M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
  ------------------
  |  |   28|  42.2M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  42.2M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  513|  42.2M|      p0 = constrain16(tap[0], row, pri_strength, pri_damping);
  514|  42.2M|      p1 = constrain16(tap[1], row, pri_strength, pri_damping);
  515|       |
  516|       |      // sum += pri_taps[0] * (p0 + p1)
  517|  42.2M|      sum = v256_add_16(
  518|  42.2M|          sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
  519|       |
  520|       |      // Primary far taps
  521|  42.2M|      tap[2] = v256_from_v128(
  522|  42.2M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
  ------------------
  |  |   28|  42.2M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  42.2M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  523|  42.2M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
  ------------------
  |  |   28|  42.2M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  42.2M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  524|  42.2M|      tap[3] = v256_from_v128(
  525|  42.2M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
  ------------------
  |  |   28|  42.2M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  42.2M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  526|  42.2M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
  ------------------
  |  |   28|  42.2M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  42.2M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  527|  42.2M|      p0 = constrain16(tap[2], row, pri_strength, pri_damping);
  528|  42.2M|      p1 = constrain16(tap[3], row, pri_strength, pri_damping);
  529|       |
  530|       |      // sum += pri_taps[1] * (p0 + p1)
  531|  42.2M|      sum = v256_add_16(
  532|  42.2M|          sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
  533|       |
  534|  42.2M|      if (clipping_required) {
  ------------------
  |  Branch (534:11): [True: 26.3M, False: 15.9M]
  ------------------
  535|  26.3M|        max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
  536|       |
  537|  26.3M|        min = v256_min_s16(min, tap[0]);
  538|  26.3M|        min = v256_min_s16(min, tap[1]);
  539|  26.3M|        min = v256_min_s16(min, tap[2]);
  540|  26.3M|        min = v256_min_s16(min, tap[3]);
  541|  26.3M|      }
  542|       |      // End primary
  543|  42.2M|    }
  544|       |
  545|  63.0M|    if (enable_secondary) {
  ------------------
  |  Branch (545:9): [True: 49.4M, False: 13.5M]
  ------------------
  546|       |      // Secondary near taps
  547|  49.4M|      tap[0] = v256_from_v128(
  548|  49.4M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  549|  49.4M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  550|  49.4M|      tap[1] = v256_from_v128(
  551|  49.4M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  552|  49.4M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  553|  49.4M|      tap[2] = v256_from_v128(
  554|  49.4M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  555|  49.4M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  556|  49.4M|      tap[3] = v256_from_v128(
  557|  49.4M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  558|  49.4M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  559|  49.4M|      p0 = constrain16(tap[0], row, sec_strength, sec_damping);
  560|  49.4M|      p1 = constrain16(tap[1], row, sec_strength, sec_damping);
  561|  49.4M|      p2 = constrain16(tap[2], row, sec_strength, sec_damping);
  562|  49.4M|      p3 = constrain16(tap[3], row, sec_strength, sec_damping);
  563|       |
  564|       |      // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
  565|  49.4M|      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
  566|  49.4M|                                            v256_add_16(v256_add_16(p0, p1),
  567|  49.4M|                                                        v256_add_16(p2, p3))));
  568|       |
  569|       |      // Secondary far taps
  570|  49.4M|      tap[4] = v256_from_v128(
  571|  49.4M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  572|  49.4M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  573|  49.4M|      tap[5] = v256_from_v128(
  574|  49.4M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  575|  49.4M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  576|  49.4M|      tap[6] = v256_from_v128(
  577|  49.4M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  578|  49.4M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  579|  49.4M|      tap[7] = v256_from_v128(
  580|  49.4M|          v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  581|  49.4M|          v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
  ------------------
  |  |   28|  49.4M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  49.4M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  582|  49.4M|      p0 = constrain16(tap[4], row, sec_strength, sec_damping);
  583|  49.4M|      p1 = constrain16(tap[5], row, sec_strength, sec_damping);
  584|  49.4M|      p2 = constrain16(tap[6], row, sec_strength, sec_damping);
  585|  49.4M|      p3 = constrain16(tap[7], row, sec_strength, sec_damping);
  586|       |
  587|       |      // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
  588|  49.4M|      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
  589|  49.4M|                                            v256_add_16(v256_add_16(p0, p1),
  590|  49.4M|                                                        v256_add_16(p2, p3))));
  591|       |
  592|  49.4M|      if (clipping_required) {
  ------------------
  |  Branch (592:11): [True: 25.3M, False: 24.1M]
  ------------------
  593|  25.3M|        max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
  594|       |
  595|  25.3M|        min = v256_min_s16(min, tap[0]);
  596|  25.3M|        min = v256_min_s16(min, tap[1]);
  597|  25.3M|        min = v256_min_s16(min, tap[2]);
  598|  25.3M|        min = v256_min_s16(min, tap[3]);
  599|  25.3M|        min = v256_min_s16(min, tap[4]);
  600|  25.3M|        min = v256_min_s16(min, tap[5]);
  601|  25.3M|        min = v256_min_s16(min, tap[6]);
  602|  25.3M|        min = v256_min_s16(min, tap[7]);
  603|  25.3M|      }
  604|       |      // End secondary
  605|  49.4M|    }
  606|       |
  607|       |    // res = row + ((sum - (sum < 0) + 8) >> 4)
  608|  63.0M|    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
  609|  63.0M|    res = v256_add_16(sum, v256_dup_16(8));
  610|  63.0M|    res = v256_shr_n_s16(res, 4);
  ------------------
  |  |  695|  63.0M|#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
  ------------------
  611|  63.0M|    res = v256_add_16(row, res);
  612|  63.0M|    if (clipping_required) {
  ------------------
  |  Branch (612:9): [True: 25.3M, False: 37.6M]
  ------------------
  613|  25.3M|      res = v256_min_s16(v256_max_s16(res, min), max);
  614|  25.3M|    }
  615|       |
  616|  63.0M|    if (is_lowbd) {
  ------------------
  |  Branch (616:9): [True: 28.7M, False: 34.2M]
  ------------------
  617|  28.7M|      const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
  618|  28.7M|      v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128));
  619|  28.7M|      v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128));
  620|  34.2M|    } else {
  621|  34.2M|      v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res));
  622|  34.2M|      v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res));
  623|  34.2M|    }
  624|  63.0M|  }
  625|  20.4M|}
cdef_block_avx2.c:constrain16:
  211|   321M|                             unsigned int adjdamp) {
  212|   321M|  v256 diff = v256_sub_16(a, b);
  213|   321M|  const v256 sign = v256_shr_n_s16(diff, 15);
  ------------------
  |  |  695|   321M|#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
  ------------------
  214|   321M|  diff = v256_abs_s16(diff);
  215|   321M|  const v256 s =
  216|   321M|      v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp));
  217|   321M|  return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
  218|   321M|}
cdef_block_avx2.c:get_max_primary:
  221|  45.6M|                                 v256 cdef_large_value_mask) {
  222|  45.6M|  if (is_lowbd) {
  ------------------
  |  Branch (222:7): [True: 22.8M, False: 22.8M]
  ------------------
  223|  22.8M|    v256 max_u8;
  224|  22.8M|    max_u8 = tap[0];
  225|  22.8M|    max_u8 = v256_max_u8(max_u8, tap[1]);
  226|  22.8M|    max_u8 = v256_max_u8(max_u8, tap[2]);
  227|  22.8M|    max_u8 = v256_max_u8(max_u8, tap[3]);
  228|       |    /* The source is 16 bits, however, we only really care about the lower
  229|       |    8 bits.  The upper 8 bits contain the "large" flag.  After the final
  230|       |    primary max has been calculated, zero out the upper 8 bits.  Use this
  231|       |    to find the "16 bit" max. */
  232|  22.8M|    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
  233|  22.8M|  } else {
  234|       |    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
  235|  22.8M|    max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask));
  236|  22.8M|    max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask));
  237|  22.8M|    max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask));
  238|  22.8M|    max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask));
  239|  22.8M|  }
  240|  45.6M|  return max;
  241|  45.6M|}
cdef_block_avx2.c:get_max_secondary:
  244|  49.2M|                                   v256 cdef_large_value_mask) {
  245|  49.2M|  if (is_lowbd) {
  ------------------
  |  Branch (245:7): [True: 24.3M, False: 24.9M]
  ------------------
  246|  24.3M|    v256 max_u8;
  247|  24.3M|    max_u8 = tap[0];
  248|  24.3M|    max_u8 = v256_max_u8(max_u8, tap[1]);
  249|  24.3M|    max_u8 = v256_max_u8(max_u8, tap[2]);
  250|  24.3M|    max_u8 = v256_max_u8(max_u8, tap[3]);
  251|  24.3M|    max_u8 = v256_max_u8(max_u8, tap[4]);
  252|  24.3M|    max_u8 = v256_max_u8(max_u8, tap[5]);
  253|  24.3M|    max_u8 = v256_max_u8(max_u8, tap[6]);
  254|  24.3M|    max_u8 = v256_max_u8(max_u8, tap[7]);
  255|       |    /* The source is 16 bits, however, we only really care about the lower
  256|       |    8 bits.  The upper 8 bits contain the "large" flag.  After the final
  257|       |    primary max has been calculated, zero out the upper 8 bits.  Use this
  258|       |    to find the "16 bit" max. */
  259|  24.3M|    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
  260|  24.9M|  } else {
  261|       |    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
  262|  24.9M|    max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask));
  263|  24.9M|    max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask));
  264|  24.9M|    max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask));
  265|  24.9M|    max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask));
  266|  24.9M|    max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask));
  267|  24.9M|    max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask));
  268|  24.9M|    max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask));
  269|  24.9M|    max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask));
  270|  24.9M|  }
  271|  49.2M|  return max;
  272|  49.2M|}
cdef_block_avx2.c:filter_block_4x4:
  284|  47.0M|                                  int enable_primary, int enable_secondary) {
  285|  47.0M|  uint8_t *dst8 = (uint8_t *)dest;
  286|  47.0M|  uint16_t *dst16 = (uint16_t *)dest;
  287|  47.0M|  const int clipping_required = enable_primary && enable_secondary;
  ------------------
  |  Branch (287:33): [True: 46.7M, False: 338k]
  |  Branch (287:51): [True: 44.2M, False: 2.45M]
  ------------------
  288|  47.0M|  v256 p0, p1, p2, p3;
  289|  47.0M|  v256 sum, row, res;
  290|  47.0M|  v256 max, min;
  291|  47.0M|  const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
  ------------------
  |  |   30|  47.0M|#define CDEF_VERY_LARGE (0x4000)
  ------------------
  292|  47.0M|  const int po1 = cdef_directions[dir][0];
  293|  47.0M|  const int po2 = cdef_directions[dir][1];
  294|  47.0M|  const int s1o1 = cdef_directions[dir + 2][0];
  295|  47.0M|  const int s1o2 = cdef_directions[dir + 2][1];
  296|  47.0M|  const int s2o1 = cdef_directions[dir - 2][0];
  297|  47.0M|  const int s2o2 = cdef_directions[dir - 2][1];
  298|  47.0M|  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  299|  47.0M|  const int *sec_taps = cdef_sec_taps;
  300|  47.0M|  int i;
  301|       |
  302|  47.0M|  if (enable_primary && pri_strength)
  ------------------
  |  Branch (302:7): [True: 47.0M, False: 35.7k]
  |  Branch (302:25): [True: 47.0M, False: 18.4E]
  ------------------
  303|  47.0M|    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  ------------------
  |  |   35|  47.0M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 5.88M, False: 41.1M]
  |  |  ------------------
  ------------------
  304|  47.0M|  if (enable_secondary && sec_strength)
  ------------------
  |  Branch (304:7): [True: 44.8M, False: 2.24M]
  |  Branch (304:27): [True: 44.8M, False: 18.4E]
  ------------------
  305|  44.8M|    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  ------------------
  |  |   35|  44.8M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 44.8M]
  |  |  ------------------
  ------------------
  306|       |
  307|  95.0M|  for (i = 0; i < height; i += 4) {
  ------------------
  |  Branch (307:15): [True: 47.9M, False: 47.0M]
  ------------------
  308|  47.9M|    sum = v256_zero();
  309|  47.9M|    row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]),
  ------------------
  |  |   28|  47.9M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  47.9M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  310|  47.9M|                        v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]),
  ------------------
  |  |   28|  47.9M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  47.9M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  311|  47.9M|                        v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
  ------------------
  |  |   28|  47.9M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  47.9M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  312|  47.9M|                        v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
  ------------------
  |  |   28|  47.9M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  47.9M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  313|  47.9M|    max = min = row;
  314|       |
  315|  47.9M|    if (enable_primary) {
  ------------------
  |  Branch (315:9): [True: 46.5M, False: 1.46M]
  ------------------
  316|  46.5M|      v256 tap[4];
  317|       |      // Primary near taps
  318|  46.5M|      tap[0] =
  319|  46.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  320|  46.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  321|  46.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  322|  46.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  323|  46.5M|      p0 = constrain16(tap[0], row, pri_strength, pri_damping);
  324|  46.5M|      tap[1] =
  325|  46.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  326|  46.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  327|  46.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  328|  46.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  329|  46.5M|      p1 = constrain16(tap[1], row, pri_strength, pri_damping);
  330|       |
  331|       |      // sum += pri_taps[0] * (p0 + p1)
  332|  46.5M|      sum = v256_add_16(
  333|  46.5M|          sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
  334|       |
  335|       |      // Primary far taps
  336|  46.5M|      tap[2] =
  337|  46.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  338|  46.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  339|  46.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  340|  46.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  341|  46.5M|      p0 = constrain16(tap[2], row, pri_strength, pri_damping);
  342|  46.5M|      tap[3] =
  343|  46.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  344|  46.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  345|  46.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  346|  46.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
  ------------------
  |  |   28|  46.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  46.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  347|  46.5M|      p1 = constrain16(tap[3], row, pri_strength, pri_damping);
  348|       |
  349|       |      // sum += pri_taps[1] * (p0 + p1)
  350|  46.5M|      sum = v256_add_16(
  351|  46.5M|          sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
  352|  46.5M|      if (clipping_required) {
  ------------------
  |  Branch (352:11): [True: 33.1M, False: 13.3M]
  ------------------
  353|  33.1M|        max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
  354|       |
  355|  33.1M|        min = v256_min_s16(min, tap[0]);
  356|  33.1M|        min = v256_min_s16(min, tap[1]);
  357|  33.1M|        min = v256_min_s16(min, tap[2]);
  358|  33.1M|        min = v256_min_s16(min, tap[3]);
  359|  33.1M|      }
  360|  46.5M|    }
  361|       |
  362|  47.9M|    if (enable_secondary) {
  ------------------
  |  Branch (362:9): [True: 33.5M, False: 14.4M]
  ------------------
  363|  33.5M|      v256 tap[8];
  364|       |      // Secondary near taps
  365|  33.5M|      tap[0] =
  366|  33.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  367|  33.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  368|  33.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  369|  33.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  370|  33.5M|      p0 = constrain16(tap[0], row, sec_strength, sec_damping);
  371|  33.5M|      tap[1] =
  372|  33.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  373|  33.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  374|  33.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  375|  33.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  376|  33.5M|      p1 = constrain16(tap[1], row, sec_strength, sec_damping);
  377|  33.5M|      tap[2] =
  378|  33.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  379|  33.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  380|  33.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  381|  33.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  382|  33.5M|      p2 = constrain16(tap[2], row, sec_strength, sec_damping);
  383|  33.5M|      tap[3] =
  384|  33.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  385|  33.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  386|  33.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  387|  33.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  388|  33.5M|      p3 = constrain16(tap[3], row, sec_strength, sec_damping);
  389|       |
  390|       |      // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
  391|  33.5M|      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
  392|  33.5M|                                            v256_add_16(v256_add_16(p0, p1),
  393|  33.5M|                                                        v256_add_16(p2, p3))));
  394|       |
  395|       |      // Secondary far taps
  396|  33.5M|      tap[4] =
  397|  33.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  398|  33.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  399|  33.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  400|  33.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  401|  33.5M|      p0 = constrain16(tap[4], row, sec_strength, sec_damping);
  402|  33.5M|      tap[5] =
  403|  33.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  404|  33.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  405|  33.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  406|  33.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  407|  33.5M|      p1 = constrain16(tap[5], row, sec_strength, sec_damping);
  408|  33.5M|      tap[6] =
  409|  33.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  410|  33.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  411|  33.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  412|  33.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  413|  33.5M|      p2 = constrain16(tap[6], row, sec_strength, sec_damping);
  414|  33.5M|      tap[7] =
  415|  33.5M|          v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  416|  33.5M|                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  417|  33.5M|                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  418|  33.5M|                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
  ------------------
  |  |   28|  33.5M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  33.5M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  419|  33.5M|      p3 = constrain16(tap[7], row, sec_strength, sec_damping);
  420|       |
  421|       |      // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
  422|  33.5M|      sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
  423|  33.5M|                                            v256_add_16(v256_add_16(p0, p1),
  424|  33.5M|                                                        v256_add_16(p2, p3))));
  425|       |
  426|  34.4M|      if (clipping_required) {
  ------------------
  |  Branch (426:11): [True: 34.4M, False: 18.4E]
  ------------------
  427|  34.4M|        max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
  428|       |
  429|  34.4M|        min = v256_min_s16(min, tap[0]);
  430|  34.4M|        min = v256_min_s16(min, tap[1]);
  431|  34.4M|        min = v256_min_s16(min, tap[2]);
  432|  34.4M|        min = v256_min_s16(min, tap[3]);
  433|  34.4M|        min = v256_min_s16(min, tap[4]);
  434|  34.4M|        min = v256_min_s16(min, tap[5]);
  435|  34.4M|        min = v256_min_s16(min, tap[6]);
  436|  34.4M|        min = v256_min_s16(min, tap[7]);
  437|  34.4M|      }
  438|  33.5M|    }
  439|       |
  440|       |    // res = row + ((sum - (sum < 0) + 8) >> 4)
  441|  47.9M|    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
  442|  47.9M|    res = v256_add_16(sum, v256_dup_16(8));
  443|  47.9M|    res = v256_shr_n_s16(res, 4);
  ------------------
  |  |  695|  47.9M|#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
  ------------------
  444|  47.9M|    res = v256_add_16(row, res);
  445|  47.9M|    if (clipping_required) {
  ------------------
  |  Branch (445:9): [True: 34.4M, False: 13.5M]
  ------------------
  446|  34.4M|      res = v256_min_s16(v256_max_s16(res, min), max);
  447|  34.4M|    }
  448|       |
  449|  47.9M|    if (is_lowbd) {
  ------------------
  |  Branch (449:9): [True: 22.5M, False: 25.3M]
  ------------------
  450|  22.5M|      const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
  451|  22.5M|      u32_store_aligned(&dst8[(i + 0) * dstride],
  452|  22.5M|                        v64_high_u32(v128_high_v64(res_128)));
  453|  22.5M|      u32_store_aligned(&dst8[(i + 1) * dstride],
  454|  22.5M|                        v64_low_u32(v128_high_v64(res_128)));
  455|  22.5M|      u32_store_aligned(&dst8[(i + 2) * dstride],
  456|  22.5M|                        v64_high_u32(v128_low_v64(res_128)));
  457|  22.5M|      u32_store_aligned(&dst8[(i + 3) * dstride],
  458|  22.5M|                        v64_low_u32(v128_low_v64(res_128)));
  459|  25.3M|    } else {
  460|  25.3M|      v64_store_aligned(&dst16[(i + 0) * dstride],
  461|  25.3M|                        v128_high_v64(v256_high_v128(res)));
  462|  25.3M|      v64_store_aligned(&dst16[(i + 1) * dstride],
  463|  25.3M|                        v128_low_v64(v256_high_v128(res)));
  464|  25.3M|      v64_store_aligned(&dst16[(i + 2) * dstride],
  465|  25.3M|                        v128_high_v64(v256_low_v128(res)));
  466|  25.3M|      v64_store_aligned(&dst16[(i + 3) * dstride],
  467|  25.3M|                        v128_low_v64(v256_low_v128(res)));
  468|  25.3M|    }
  469|  47.9M|  }
  470|  47.0M|}
cdef_block_avx2.c:copy_block_8xh:
  664|  5.22M|                                const uint16_t *in, int height) {
  665|  5.22M|  uint8_t *dst8 = (uint8_t *)dest;
  666|  5.22M|  uint16_t *dst16 = (uint16_t *)dest;
  667|  5.22M|  int i;
  668|  26.0M|  for (i = 0; i < height; i += 2) {
  ------------------
  |  Branch (668:15): [True: 20.8M, False: 5.22M]
  ------------------
  669|  20.8M|    const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]);
  ------------------
  |  |   28|  20.8M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  20.8M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  670|  20.8M|    const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]);
  ------------------
  |  |   28|  20.8M|  ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
  |  |  ------------------
  |  |  |  |   69|  20.8M|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  |  |  ------------------
  ------------------
  671|  20.8M|    if (is_lowbd) {
  ------------------
  |  Branch (671:9): [True: 11.0M, False: 9.75M]
  ------------------
  672|       |      /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */
  673|  11.0M|      const v128 res_128 = v128_pack_s16_u8(row1, row0);
  674|  11.0M|      v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128));
  675|  11.0M|      v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128));
  676|  11.0M|    } else {
  677|  9.75M|      v128_store_unaligned(&dst16[i * dstride], row0);
  678|  9.75M|      v128_store_unaligned(&dst16[(i + 1) * dstride], row1);
  679|  9.75M|    }
  680|  20.8M|  }
  681|  5.22M|}

cfl_init:
   18|   288k|void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) {
   19|   288k|  assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
   20|   288k|  assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
   21|       |
   22|   288k|  memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));
   23|   288k|  memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
   24|   288k|  cfl->subsampling_x = seq_params->subsampling_x;
   25|   288k|  cfl->subsampling_y = seq_params->subsampling_y;
   26|   288k|  cfl->are_parameters_computed = 0;
   27|   288k|  cfl->store_y = 0;
   28|       |  // The DC_PRED cache is disabled by default and is only enabled in
   29|       |  // cfl_rd_pick_alpha
   30|   288k|  clear_cfl_dc_pred_cache_flags(cfl);
   31|   288k|}
av1_cfl_predict_block:
  189|  2.97M|                           TX_SIZE tx_size, int plane) {
  190|  2.97M|  CFL_CTX *const cfl = &xd->cfl;
  191|  2.97M|  MB_MODE_INFO *mbmi = xd->mi[0];
  192|  2.97M|  assert(is_cfl_allowed(xd));
  193|       |
  194|  2.97M|  if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size);
  ------------------
  |  Branch (194:7): [True: 1.48M, False: 1.48M]
  ------------------
  195|       |
  196|  2.97M|  const int alpha_q3 =
  197|  2.97M|      cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
  198|  2.97M|  assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
  199|  2.97M|         CFL_BUF_SQUARE);
  200|  2.97M|#if CONFIG_AV1_HIGHBITDEPTH
  201|  2.97M|  if (is_cur_buf_hbd(xd)) {
  ------------------
  |  Branch (201:7): [True: 1.57M, False: 1.40M]
  ------------------
  202|  1.57M|    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
  ------------------
  |  |   75|  1.57M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  203|  1.57M|    cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride,
  204|  1.57M|                                    alpha_q3, xd->bd);
  205|  1.57M|    return;
  206|  1.57M|  }
  207|  1.40M|#endif
  208|  1.40M|  cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
  209|  1.40M|}
cfl_store_tx:
  391|  2.28M|                  BLOCK_SIZE bsize) {
  392|  2.28M|  CFL_CTX *const cfl = &xd->cfl;
  393|  2.28M|  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
  ------------------
  |  |  226|  2.28M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  394|  2.28M|  uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
  ------------------
  |  |   39|  2.28M|#define MI_SIZE_LOG2 2
  ------------------
  395|       |
  396|  2.28M|  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
  ------------------
  |  Branch (396:7): [True: 718k, False: 1.56M]
  |  Branch (396:38): [True: 366k, False: 1.20M]
  ------------------
  397|       |    // Only dimensions of size 4 can have an odd offset.
  398|  1.08M|    assert(!((col & 1) && tx_size_wide[tx_size] != 4));
  399|  1.08M|    assert(!((row & 1) && tx_size_high[tx_size] != 4));
  400|  1.08M|    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
  401|  1.08M|  }
  402|  2.28M|  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
  403|  2.28M|}
cfl_store_block:
  421|   745k|void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
  422|   745k|  CFL_CTX *const cfl = &xd->cfl;
  423|   745k|  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
  ------------------
  |  |  226|   745k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  424|   745k|  int row = 0;
  425|   745k|  int col = 0;
  426|       |
  427|   745k|  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
  ------------------
  |  Branch (427:7): [True: 494k, False: 250k]
  |  Branch (427:38): [True: 250k, False: 0]
  ------------------
  428|   745k|    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
  429|   745k|  }
  430|   745k|  const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
  ------------------
  |  |  226|   745k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  431|   745k|  const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
  ------------------
  |  |  226|   745k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  432|   745k|  tx_size = get_tx_size(width, height);
  433|   745k|  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
  434|   745k|            is_cur_buf_hbd(xd));
  435|   745k|}
cfl.c:cfl_compute_parameters:
  178|  1.48M|static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
  179|  1.48M|  CFL_CTX *const cfl = &xd->cfl;
  180|       |  // Do not call cfl_compute_parameters multiple time on the same values.
  181|  1.48M|  assert(cfl->are_parameters_computed == 0);
  182|       |
  183|  1.48M|  cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]);
  184|  1.48M|  cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3);
  185|  1.48M|  cfl->are_parameters_computed = 1;
  186|  1.48M|}
cfl.c:cfl_pad:
   83|  1.48M|static inline void cfl_pad(CFL_CTX *cfl, int width, int height) {
   84|  1.48M|  const int diff_width = width - cfl->buf_width;
   85|  1.48M|  const int diff_height = height - cfl->buf_height;
   86|       |
   87|  1.48M|  if (diff_width > 0) {
  ------------------
  |  Branch (87:7): [True: 519, False: 1.48M]
  ------------------
   88|    519|    const int min_height = height - diff_height;
   89|    519|    uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width);
   90|  11.9k|    for (int j = 0; j < min_height; j++) {
  ------------------
  |  Branch (90:21): [True: 11.4k, False: 519]
  ------------------
   91|  11.4k|      const uint16_t last_pixel = recon_buf_q3[-1];
   92|  11.4k|      assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
   93|  98.9k|      for (int i = 0; i < diff_width; i++) {
  ------------------
  |  Branch (93:23): [True: 87.4k, False: 11.4k]
  ------------------
   94|  87.4k|        recon_buf_q3[i] = last_pixel;
   95|  87.4k|      }
   96|  11.4k|      recon_buf_q3 += CFL_BUF_LINE;
  ------------------
  |  |  522|  11.4k|#define CFL_BUF_LINE (32)
  ------------------
   97|  11.4k|    }
   98|    519|    cfl->buf_width = width;
   99|    519|  }
  100|  1.48M|  if (diff_height > 0) {
  ------------------
  |  Branch (100:7): [True: 1.14k, False: 1.48M]
  ------------------
  101|  1.14k|    uint16_t *recon_buf_q3 =
  102|  1.14k|        cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE);
  ------------------
  |  |  522|  1.14k|#define CFL_BUF_LINE (32)
  ------------------
  103|  9.40k|    for (int j = 0; j < diff_height; j++) {
  ------------------
  |  Branch (103:21): [True: 8.25k, False: 1.14k]
  ------------------
  104|  8.25k|      const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE;
  ------------------
  |  |  522|  8.25k|#define CFL_BUF_LINE (32)
  ------------------
  105|  8.25k|      assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
  106|  76.3k|      for (int i = 0; i < width; i++) {
  ------------------
  |  Branch (106:23): [True: 68.0k, False: 8.25k]
  ------------------
  107|  68.0k|        recon_buf_q3[i] = last_row_q3[i];
  108|  68.0k|      }
  109|  8.25k|      recon_buf_q3 += CFL_BUF_LINE;
  ------------------
  |  |  522|  8.25k|#define CFL_BUF_LINE (32)
  ------------------
  110|  8.25k|    }
  111|  1.14k|    cfl->buf_height = height;
  112|  1.14k|  }
  113|  1.48M|}
cfl.c:cfl_idx_to_alpha:
  138|  2.97M|                                   CFL_PRED_TYPE pred_type) {
  139|  2.97M|  const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign)
  ------------------
  |  |  281|  1.48M|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  ------------------
  |  Branch (139:26): [True: 1.48M, False: 1.48M]
  ------------------
  140|  2.97M|                                                   : CFL_SIGN_V(joint_sign);
  ------------------
  |  |  283|  1.48M|#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js))
  |  |  ------------------
  |  |  |  |  281|  1.48M|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  |  |  ------------------
  ------------------
  141|  2.97M|  if (alpha_sign == CFL_SIGN_ZERO) return 0;
  ------------------
  |  Branch (141:7): [True: 535k, False: 2.44M]
  ------------------
  142|  2.44M|  const int abs_alpha_q3 =
  143|  2.44M|      (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx);
  ------------------
  |  |  260|  1.37M|#define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |  256|  1.37M|#define CFL_ALPHABET_SIZE_LOG2 4
  |  |  ------------------
  ------------------
                    (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx);
  ------------------
  |  |  261|  1.06M|#define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1))
  |  |  ------------------
  |  |  |  |  257|  1.06M|#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  256|  1.06M|#define CFL_ALPHABET_SIZE_LOG2 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (143:7): [True: 1.37M, False: 1.06M]
  ------------------
  144|  2.44M|  return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1;
  ------------------
  |  Branch (144:10): [True: 1.13M, False: 1.30M]
  ------------------
  145|  2.97M|}
cfl.c:sub8x8_adjust_offset:
  376|  1.83M|                                        int *col_out) {
  377|       |  // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
  378|  1.83M|  if ((mi_row & 0x01) && cfl->subsampling_y) {
  ------------------
  |  Branch (378:7): [True: 308k, False: 1.52M]
  |  Branch (378:26): [True: 149k, False: 158k]
  ------------------
  379|   149k|    assert(*row_out == 0);
  380|   149k|    (*row_out)++;
  381|   149k|  }
  382|       |
  383|       |  // Increment col index for right: 4x8, 4x16 or both right 4x4s.
  384|  1.83M|  if ((mi_col & 0x01) && cfl->subsampling_x) {
  ------------------
  |  Branch (384:7): [True: 265k, False: 1.56M]
  |  Branch (384:26): [True: 144k, False: 121k]
  ------------------
  385|   144k|    assert(*col_out == 0);
  386|   144k|    (*col_out)++;
  387|   144k|  }
  388|  1.83M|}
cfl.c:cfl_store:
  326|  3.03M|                      int row, int col, TX_SIZE tx_size, int use_hbd) {
  327|  3.03M|  const int width = tx_size_wide[tx_size];
  328|  3.03M|  const int height = tx_size_high[tx_size];
  329|  3.03M|  const int tx_off_log2 = MI_SIZE_LOG2;
  ------------------
  |  |   39|  3.03M|#define MI_SIZE_LOG2 2
  ------------------
  330|  3.03M|  const int sub_x = cfl->subsampling_x;
  331|  3.03M|  const int sub_y = cfl->subsampling_y;
  332|  3.03M|  const int store_row = row << (tx_off_log2 - sub_y);
  333|  3.03M|  const int store_col = col << (tx_off_log2 - sub_x);
  334|  3.03M|  const int store_height = height >> sub_y;
  335|  3.03M|  const int store_width = width >> sub_x;
  336|       |
  337|       |  // Invalidate current parameters
  338|  3.03M|  cfl->are_parameters_computed = 0;
  339|       |
  340|       |  // Store the surface of the pixel buffer that was written to, this way we
  341|       |  // can manage chroma overrun (e.g. when the chroma surfaces goes beyond the
  342|       |  // frame boundary)
  343|  3.03M|  if (col == 0 && row == 0) {
  ------------------
  |  Branch (343:7): [True: 2.71M, False: 319k]
  |  Branch (343:19): [True: 2.44M, False: 267k]
  ------------------
  344|  2.44M|    cfl->buf_width = store_width;
  345|  2.44M|    cfl->buf_height = store_height;
  346|  2.44M|  } else {
  347|   586k|    cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width);
  ------------------
  |  |   45|   586k|#define OD_MAXI AOMMAX
  |  |  ------------------
  |  |  |  |   35|   586k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:23): [True: 223k, False: 363k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  348|   586k|    cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height);
  ------------------
  |  |   45|   586k|#define OD_MAXI AOMMAX
  |  |  ------------------
  |  |  |  |   35|   586k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (35:23): [True: 267k, False: 319k]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  349|   586k|  }
  350|       |
  351|       |  // Check that we will remain inside the pixel buffer.
  352|  3.03M|  assert(store_row + store_height <= CFL_BUF_LINE);
  353|  3.03M|  assert(store_col + store_width <= CFL_BUF_LINE);
  354|       |
  355|       |  // Store the input into the CfL pixel buffer
  356|  3.03M|  uint16_t *recon_buf_q3 =
  357|  3.03M|      cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
  ------------------
  |  |  522|  3.03M|#define CFL_BUF_LINE (32)
  ------------------
  358|  3.03M|#if CONFIG_AV1_HIGHBITDEPTH
  359|  3.03M|  if (use_hbd) {
  ------------------
  |  Branch (359:7): [True: 1.64M, False: 1.38M]
  ------------------
  360|  1.64M|    cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
  ------------------
  |  |   75|  1.64M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  361|  1.64M|                                               input_stride, recon_buf_q3);
  362|  1.64M|  } else {
  363|  1.38M|    cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
  364|  1.38M|                                               recon_buf_q3);
  365|  1.38M|  }
  366|       |#else
  367|       |  (void)use_hbd;
  368|       |  cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3);
  369|       |#endif
  370|  3.03M|}
cfl.c:cfl_subsampling_hbd:
  303|  1.64M|                                                       int sub_x, int sub_y) {
  304|  1.64M|  if (sub_x == 1) {
  ------------------
  |  Branch (304:7): [True: 880k, False: 767k]
  ------------------
  305|   880k|    if (sub_y == 1) {
  ------------------
  |  Branch (305:9): [True: 878k, False: 1.88k]
  ------------------
  306|   878k|      return cfl_get_luma_subsampling_420_hbd(tx_size);
  307|   878k|    }
  308|  1.88k|    return cfl_get_luma_subsampling_422_hbd(tx_size);
  309|   880k|  }
  310|   767k|  return cfl_get_luma_subsampling_444_hbd(tx_size);
  311|  1.64M|}
cfl.c:cfl_subsampling_lbd:
  315|  1.38M|                                                       int sub_x, int sub_y) {
  316|  1.38M|  if (sub_x == 1) {
  ------------------
  |  Branch (316:7): [True: 963k, False: 421k]
  ------------------
  317|   963k|    if (sub_y == 1) {
  ------------------
  |  Branch (317:9): [True: 961k, False: 2.22k]
  ------------------
  318|   961k|      return cfl_get_luma_subsampling_420_lbd(tx_size);
  319|   961k|    }
  320|  2.22k|    return cfl_get_luma_subsampling_422_lbd(tx_size);
  321|   963k|  }
  322|   421k|  return cfl_get_luma_subsampling_444_lbd(tx_size);
  323|  1.38M|}
cfl.c:max_intra_block_width:
  407|   745k|                                        TX_SIZE tx_size) {
  408|   745k|  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
  409|   745k|                              << MI_SIZE_LOG2;
  ------------------
  |  |   39|   745k|#define MI_SIZE_LOG2 2
  ------------------
  410|   745k|  return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
  ------------------
  |  |   69|   745k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  411|   745k|}
cfl.c:max_intra_block_height:
  415|   745k|                                         TX_SIZE tx_size) {
  416|   745k|  const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
  417|   745k|                              << MI_SIZE_LOG2;
  ------------------
  |  |   39|   745k|#define MI_SIZE_LOG2 2
  ------------------
  418|   745k|  return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
  ------------------
  |  |   69|   745k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  419|   745k|}

decodeframe.c:store_cfl_required:
   39|  43.3M|                                                  const MACROBLOCKD *xd) {
   40|  43.3M|  const MB_MODE_INFO *mbmi = xd->mi[0];
   41|       |
   42|  43.3M|  if (cm->seq_params->monochrome) return CFL_DISALLOWED;
  ------------------
  |  Branch (42:7): [True: 1.11M, False: 42.2M]
  ------------------
   43|       |
   44|  42.2M|  if (!xd->is_chroma_ref) {
  ------------------
  |  Branch (44:7): [True: 1.27M, False: 40.9M]
  ------------------
   45|       |    // For non-chroma-reference blocks, we should always store the luma pixels,
   46|       |    // in case the corresponding chroma-reference block uses CfL.
   47|       |    // Note that this can only happen for block sizes which are <8 on
   48|       |    // their shortest side, as otherwise they would be chroma reference
   49|       |    // blocks.
   50|  1.27M|    return CFL_ALLOWED;
   51|  1.27M|  }
   52|       |
   53|       |  // If this block has chroma information, we know whether we're
   54|       |  // actually going to perform a CfL prediction
   55|  40.9M|  return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) &&
  ------------------
  |  Branch (55:29): [True: 37.4M, False: 3.47M]
  ------------------
   56|  40.9M|                            mbmi->uv_mode == UV_CFL_PRED);
  ------------------
  |  Branch (56:29): [True: 1.75M, False: 35.7M]
  ------------------
   57|  42.2M|}
decodemv.c:is_cfl_allowed:
   19|  9.87M|static inline CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
   20|  9.87M|  const MB_MODE_INFO *mbmi = xd->mi[0];
   21|  9.87M|  const BLOCK_SIZE bsize = mbmi->bsize;
   22|  9.87M|  assert(bsize < BLOCK_SIZES_ALL);
   23|  9.87M|  if (xd->lossless[mbmi->segment_id]) {
  ------------------
  |  Branch (23:7): [True: 88.7k, False: 9.78M]
  ------------------
   24|       |    // In lossless, CfL is available when the partition size is equal to the
   25|       |    // transform size.
   26|  88.7k|    const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
  ------------------
  |  |  227|  88.7k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   27|  88.7k|    const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
  ------------------
  |  |  227|  88.7k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   28|  88.7k|    const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
   29|  88.7k|    return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);
   30|  88.7k|  }
   31|       |  // Spec: CfL is available to luma partitions lesser than or equal to 32x32
   32|  9.78M|  return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
  ------------------
  |  Branch (32:29): [True: 8.86M, False: 917k]
  ------------------
   33|  9.78M|                            block_size_high[bsize] <= 32);
  ------------------
  |  Branch (33:29): [True: 8.16M, False: 704k]
  ------------------
   34|  9.87M|}
decodemv.c:store_cfl_required:
   39|  15.5M|                                                  const MACROBLOCKD *xd) {
   40|  15.5M|  const MB_MODE_INFO *mbmi = xd->mi[0];
   41|       |
   42|  15.5M|  if (cm->seq_params->monochrome) return CFL_DISALLOWED;
  ------------------
  |  Branch (42:7): [True: 604k, False: 14.9M]
  ------------------
   43|       |
   44|  14.9M|  if (!xd->is_chroma_ref) {
  ------------------
  |  Branch (44:7): [True: 1.31M, False: 13.6M]
  ------------------
   45|       |    // For non-chroma-reference blocks, we should always store the luma pixels,
   46|       |    // in case the corresponding chroma-reference block uses CfL.
   47|       |    // Note that this can only happen for block sizes which are <8 on
   48|       |    // their shortest side, as otherwise they would be chroma reference
   49|       |    // blocks.
   50|  1.31M|    return CFL_ALLOWED;
   51|  1.31M|  }
   52|       |
   53|       |  // If this block has chroma information, we know whether we're
   54|       |  // actually going to perform a CfL prediction
   55|  13.6M|  return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) &&
  ------------------
  |  Branch (55:29): [True: 9.87M, False: 3.74M]
  ------------------
   56|  13.6M|                            mbmi->uv_mode == UV_CFL_PRED);
  ------------------
  |  Branch (56:29): [True: 1.54M, False: 8.32M]
  ------------------
   57|  14.9M|}
cfl.c:clear_cfl_dc_pred_cache_flags:
   69|   287k|static inline void clear_cfl_dc_pred_cache_flags(CFL_CTX *cfl) {
   70|   287k|  cfl->use_dc_pred_cache = false;
   71|   287k|  cfl->dc_pred_is_cached[CFL_PRED_U] = false;
   72|   287k|  cfl->dc_pred_is_cached[CFL_PRED_V] = false;
   73|   287k|}
cfl.c:is_cfl_allowed:
   19|  2.97M|static inline CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
   20|  2.97M|  const MB_MODE_INFO *mbmi = xd->mi[0];
   21|  2.97M|  const BLOCK_SIZE bsize = mbmi->bsize;
   22|  2.97M|  assert(bsize < BLOCK_SIZES_ALL);
   23|  2.97M|  if (xd->lossless[mbmi->segment_id]) {
  ------------------
  |  Branch (23:7): [True: 1.88k, False: 2.97M]
  ------------------
   24|       |    // In lossless, CfL is available when the partition size is equal to the
   25|       |    // transform size.
   26|  1.88k|    const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
  ------------------
  |  |  227|  1.88k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   27|  1.88k|    const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
  ------------------
  |  |  227|  1.88k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
   28|  1.88k|    const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
   29|  1.88k|    return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);
   30|  1.88k|  }
   31|       |  // Spec: CfL is available to luma partitions lesser than or equal to 32x32
   32|  2.97M|  return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
  ------------------
  |  Branch (32:29): [True: 2.97M, False: 0]
  ------------------
   33|  2.97M|                            block_size_high[bsize] <= 32);
  ------------------
  |  Branch (33:29): [True: 2.97M, False: 0]
  ------------------
   34|  2.97M|}
reconintra.c:get_cfl_pred_type:
   64|  2.97M|static inline CFL_PRED_TYPE get_cfl_pred_type(int plane) {
   65|  2.97M|  assert(plane > 0);
   66|  2.97M|  return (CFL_PRED_TYPE)(plane - 1);
   67|  2.97M|}
cfl_subtract_average_4x4_sse2:
  178|   269k|                                                        int16_t *dst) {      \
  179|   269k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|   269k|                            num_pel_log2);                                   \
  181|   269k|  }
cfl_subtract_average_4x8_sse2:
  178|   109k|                                                        int16_t *dst) {      \
  179|   109k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|   109k|                            num_pel_log2);                                   \
  181|   109k|  }
cfl_subtract_average_4x16_sse2:
  178|  91.2k|                                                        int16_t *dst) {      \
  179|  91.2k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  91.2k|                            num_pel_log2);                                   \
  181|  91.2k|  }
cfl_subtract_average_8x4_sse2:
  178|   173k|                                                        int16_t *dst) {      \
  179|   173k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|   173k|                            num_pel_log2);                                   \
  181|   173k|  }
cfl_subtract_average_8x8_sse2:
  178|   234k|                                                        int16_t *dst) {      \
  179|   234k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|   234k|                            num_pel_log2);                                   \
  181|   234k|  }
cfl_subtract_average_8x16_sse2:
  178|  84.2k|                                                        int16_t *dst) {      \
  179|  84.2k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  84.2k|                            num_pel_log2);                                   \
  181|  84.2k|  }
cfl_subtract_average_8x32_sse2:
  178|  45.6k|                                                        int16_t *dst) {      \
  179|  45.6k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  45.6k|                            num_pel_log2);                                   \
  181|  45.6k|  }
cfl_subsample_lbd_420_4x4_ssse3:
  101|   197k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   197k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   197k|                                               output_q3, width, height); \
  104|   197k|  }
cfl_subsample_lbd_420_8x8_ssse3:
  101|  87.3k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  87.3k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  87.3k|                                               output_q3, width, height); \
  104|  87.3k|  }
cfl_subsample_lbd_420_16x16_ssse3:
  101|  48.0k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  48.0k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  48.0k|                                               output_q3, width, height); \
  104|  48.0k|  }
cfl_subsample_lbd_420_4x8_ssse3:
  101|  95.4k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  95.4k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  95.4k|                                               output_q3, width, height); \
  104|  95.4k|  }
cfl_subsample_lbd_420_8x4_ssse3:
  101|   119k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   119k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   119k|                                               output_q3, width, height); \
  104|   119k|  }
cfl_subsample_lbd_420_8x16_ssse3:
  101|  24.8k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  24.8k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  24.8k|                                               output_q3, width, height); \
  104|  24.8k|  }
cfl_subsample_lbd_420_16x8_ssse3:
  101|  39.5k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  39.5k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  39.5k|                                               output_q3, width, height); \
  104|  39.5k|  }
cfl_subsample_lbd_420_16x32_ssse3:
  101|  8.56k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  8.56k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  8.56k|                                               output_q3, width, height); \
  104|  8.56k|  }
cfl_subsample_lbd_420_4x16_ssse3:
  101|   126k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   126k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   126k|                                               output_q3, width, height); \
  104|   126k|  }
cfl_subsample_lbd_420_16x4_ssse3:
  101|   157k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   157k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   157k|                                               output_q3, width, height); \
  104|   157k|  }
cfl_subsample_lbd_420_8x32_ssse3:
  101|  10.6k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  10.6k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  10.6k|                                               output_q3, width, height); \
  104|  10.6k|  }
cfl_subsample_lbd_422_4x4_ssse3:
  101|    668|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    668|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    668|                                               output_q3, width, height); \
  104|    668|  }
cfl_subsample_lbd_422_8x8_ssse3:
  101|    395|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    395|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    395|                                               output_q3, width, height); \
  104|    395|  }
cfl_subsample_lbd_422_16x16_ssse3:
  101|    261|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    261|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    261|                                               output_q3, width, height); \
  104|    261|  }
cfl_subsample_lbd_422_8x4_ssse3:
  101|    100|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    100|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    100|                                               output_q3, width, height); \
  104|    100|  }
cfl_subsample_lbd_422_16x8_ssse3:
  101|    178|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    178|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    178|                                               output_q3, width, height); \
  104|    178|  }
cfl_subsample_lbd_422_16x4_ssse3:
  101|    197|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    197|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    197|                                               output_q3, width, height); \
  104|    197|  }
cfl_subsample_lbd_444_4x4_ssse3:
  101|  70.9k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  70.9k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  70.9k|                                               output_q3, width, height); \
  104|  70.9k|  }
cfl_subsample_lbd_444_8x8_ssse3:
  101|  77.9k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  77.9k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  77.9k|                                               output_q3, width, height); \
  104|  77.9k|  }
cfl_subsample_lbd_444_16x16_ssse3:
  101|  37.6k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  37.6k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  37.6k|                                               output_q3, width, height); \
  104|  37.6k|  }
cfl_subsample_lbd_444_4x8_ssse3:
  101|  15.4k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  15.4k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  15.4k|                                               output_q3, width, height); \
  104|  15.4k|  }
cfl_subsample_lbd_444_8x4_ssse3:
  101|  24.5k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  24.5k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  24.5k|                                               output_q3, width, height); \
  104|  24.5k|  }
cfl_subsample_lbd_444_8x16_ssse3:
  101|  23.0k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  23.0k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  23.0k|                                               output_q3, width, height); \
  104|  23.0k|  }
cfl_subsample_lbd_444_16x8_ssse3:
  101|  32.1k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  32.1k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  32.1k|                                               output_q3, width, height); \
  104|  32.1k|  }
cfl_subsample_lbd_444_16x32_ssse3:
  101|  10.2k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  10.2k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  10.2k|                                               output_q3, width, height); \
  104|  10.2k|  }
cfl_subsample_lbd_444_4x16_ssse3:
  101|  29.8k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  29.8k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  29.8k|                                               output_q3, width, height); \
  104|  29.8k|  }
cfl_subsample_lbd_444_16x4_ssse3:
  101|  38.3k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  38.3k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  38.3k|                                               output_q3, width, height); \
  104|  38.3k|  }
cfl_subsample_lbd_444_8x32_ssse3:
  101|  18.0k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  18.0k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  18.0k|                                               output_q3, width, height); \
  104|  18.0k|  }
cfl_subsample_hbd_420_4x4_ssse3:
  101|   162k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   162k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   162k|                                               output_q3, width, height); \
  104|   162k|  }
cfl_subsample_hbd_420_8x8_ssse3:
  101|  70.8k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  70.8k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  70.8k|                                               output_q3, width, height); \
  104|  70.8k|  }
cfl_subsample_hbd_420_16x16_ssse3:
  101|  28.7k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  28.7k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  28.7k|                                               output_q3, width, height); \
  104|  28.7k|  }
cfl_subsample_hbd_420_4x8_ssse3:
  101|   104k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   104k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   104k|                                               output_q3, width, height); \
  104|   104k|  }
cfl_subsample_hbd_420_8x4_ssse3:
  101|   145k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   145k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   145k|                                               output_q3, width, height); \
  104|   145k|  }
cfl_subsample_hbd_420_8x16_ssse3:
  101|  11.7k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  11.7k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  11.7k|                                               output_q3, width, height); \
  104|  11.7k|  }
cfl_subsample_hbd_420_16x8_ssse3:
  101|  26.2k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  26.2k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  26.2k|                                               output_q3, width, height); \
  104|  26.2k|  }
cfl_subsample_hbd_420_16x32_ssse3:
  101|  4.51k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  4.51k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  4.51k|                                               output_q3, width, height); \
  104|  4.51k|  }
cfl_subsample_hbd_420_4x16_ssse3:
  101|   124k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   124k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   124k|                                               output_q3, width, height); \
  104|   124k|  }
cfl_subsample_hbd_420_16x4_ssse3:
  101|   163k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   163k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   163k|                                               output_q3, width, height); \
  104|   163k|  }
cfl_subsample_hbd_420_8x32_ssse3:
  101|  9.08k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  9.08k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  9.08k|                                               output_q3, width, height); \
  104|  9.08k|  }
cfl_subsample_hbd_422_4x4_ssse3:
  101|  1.22k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  1.22k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  1.22k|                                               output_q3, width, height); \
  104|  1.22k|  }
cfl_subsample_hbd_422_8x8_ssse3:
  101|    228|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    228|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    228|                                               output_q3, width, height); \
  104|    228|  }
cfl_subsample_hbd_422_16x16_ssse3:
  101|     37|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     37|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     37|                                               output_q3, width, height); \
  104|     37|  }
cfl_subsample_hbd_422_8x4_ssse3:
  101|    119|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    119|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    119|                                               output_q3, width, height); \
  104|    119|  }
cfl_subsample_hbd_422_16x8_ssse3:
  101|    109|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    109|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    109|                                               output_q3, width, height); \
  104|    109|  }
cfl_subsample_hbd_422_16x4_ssse3:
  101|     44|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     44|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     44|                                               output_q3, width, height); \
  104|     44|  }
cfl_subsample_hbd_444_4x4_ssse3:
  101|   143k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   143k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   143k|                                               output_q3, width, height); \
  104|   143k|  }
cfl_subsample_hbd_444_8x8_ssse3:
  101|   205k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|   205k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|   205k|                                               output_q3, width, height); \
  104|   205k|  }
cfl_subsample_hbd_444_16x16_ssse3:
  101|  43.0k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  43.0k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  43.0k|                                               output_q3, width, height); \
  104|  43.0k|  }
cfl_subsample_hbd_444_4x8_ssse3:
  101|  43.5k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  43.5k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  43.5k|                                               output_q3, width, height); \
  104|  43.5k|  }
cfl_subsample_hbd_444_8x4_ssse3:
  101|  71.4k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  71.4k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  71.4k|                                               output_q3, width, height); \
  104|  71.4k|  }
cfl_subsample_hbd_444_8x16_ssse3:
  101|  49.0k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  49.0k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  49.0k|                                               output_q3, width, height); \
  104|  49.0k|  }
cfl_subsample_hbd_444_16x8_ssse3:
  101|  53.6k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  53.6k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  53.6k|                                               output_q3, width, height); \
  104|  53.6k|  }
cfl_subsample_hbd_444_16x32_ssse3:
  101|  7.06k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  7.06k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  7.06k|                                               output_q3, width, height); \
  104|  7.06k|  }
cfl_subsample_hbd_444_4x16_ssse3:
  101|  30.2k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  30.2k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  30.2k|                                               output_q3, width, height); \
  104|  30.2k|  }
cfl_subsample_hbd_444_16x4_ssse3:
  101|  43.2k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  43.2k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  43.2k|                                               output_q3, width, height); \
  104|  43.2k|  }
cfl_subsample_hbd_444_8x32_ssse3:
  101|  15.8k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  15.8k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  15.8k|                                               output_q3, width, height); \
  104|  15.8k|  }
cfl_predict_lbd_4x4_ssse3:
  232|   305k|      int alpha_q3) {                                                          \
  233|   305k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|   305k|                           height);                                            \
  235|   305k|  }
cfl_predict_lbd_4x8_ssse3:
  232|   112k|      int alpha_q3) {                                                          \
  233|   112k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|   112k|                           height);                                            \
  235|   112k|  }
cfl_predict_lbd_4x16_ssse3:
  232|  82.7k|      int alpha_q3) {                                                          \
  233|  82.7k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  82.7k|                           height);                                            \
  235|  82.7k|  }
cfl_predict_lbd_8x4_ssse3:
  232|   164k|      int alpha_q3) {                                                          \
  233|   164k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|   164k|                           height);                                            \
  235|   164k|  }
cfl_predict_lbd_8x8_ssse3:
  232|   210k|      int alpha_q3) {                                                          \
  233|   210k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|   210k|                           height);                                            \
  235|   210k|  }
cfl_predict_lbd_8x16_ssse3:
  232|  64.1k|      int alpha_q3) {                                                          \
  233|  64.1k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  64.1k|                           height);                                            \
  235|  64.1k|  }
cfl_predict_lbd_8x32_ssse3:
  232|  44.0k|      int alpha_q3) {                                                          \
  233|  44.0k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  44.0k|                           height);                                            \
  235|  44.0k|  }
cfl_predict_lbd_16x4_ssse3:
  232|   106k|      int alpha_q3) {                                                          \
  233|   106k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|   106k|                           height);                                            \
  235|   106k|  }
cfl_predict_lbd_16x8_ssse3:
  232|  83.9k|      int alpha_q3) {                                                          \
  233|  83.9k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  83.9k|                           height);                                            \
  235|  83.9k|  }
cfl_predict_lbd_16x16_ssse3:
  232|   124k|      int alpha_q3) {                                                          \
  233|   124k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|   124k|                           height);                                            \
  235|   124k|  }
cfl_predict_lbd_16x32_ssse3:
  232|  20.9k|      int alpha_q3) {                                                          \
  233|  20.9k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  20.9k|                           height);                                            \
  235|  20.9k|  }
cfl_predict_hbd_4x4_ssse3:
  244|   233k|      int bd) {                                                                \
  245|   233k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|   233k|                           height);                                            \
  247|   233k|  }
cfl_predict_hbd_4x8_ssse3:
  244|   105k|      int bd) {                                                                \
  245|   105k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|   105k|                           height);                                            \
  247|   105k|  }
cfl_predict_hbd_4x16_ssse3:
  244|  99.7k|      int bd) {                                                                \
  245|  99.7k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  99.7k|                           height);                                            \
  247|  99.7k|  }
cfl_predict_hbd_8x4_ssse3:
  244|   183k|      int bd) {                                                                \
  245|   183k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|   183k|                           height);                                            \
  247|   183k|  }
cfl_predict_hbd_8x8_ssse3:
  244|   258k|      int bd) {                                                                \
  245|   258k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|   258k|                           height);                                            \
  247|   258k|  }
cfl_predict_hbd_8x16_ssse3:
  244|   104k|      int bd) {                                                                \
  245|   104k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|   104k|                           height);                                            \
  247|   104k|  }
cfl_predict_hbd_8x32_ssse3:
  244|  47.3k|      int bd) {                                                                \
  245|  47.3k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  47.3k|                           height);                                            \
  247|  47.3k|  }
cfl_subsample_lbd_420_32x32_avx2:
  101|  23.5k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  23.5k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  23.5k|                                               output_q3, width, height); \
  104|  23.5k|  }
cfl_subsample_lbd_420_32x16_avx2:
  101|  8.71k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  8.71k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  8.71k|                                               output_q3, width, height); \
  104|  8.71k|  }
cfl_subsample_lbd_420_32x8_avx2:
  101|  14.4k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  14.4k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  14.4k|                                               output_q3, width, height); \
  104|  14.4k|  }
cfl_subsample_lbd_422_32x32_avx2:
  101|     90|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     90|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     90|                                               output_q3, width, height); \
  104|     90|  }
cfl_subsample_lbd_422_32x16_avx2:
  101|    181|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    181|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    181|                                               output_q3, width, height); \
  104|    181|  }
cfl_subsample_lbd_422_32x8_avx2:
  101|    154|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|    154|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|    154|                                               output_q3, width, height); \
  104|    154|  }
cfl_subsample_lbd_444_32x32_avx2:
  101|  17.7k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  17.7k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  17.7k|                                               output_q3, width, height); \
  104|  17.7k|  }
cfl_subsample_lbd_444_32x16_avx2:
  101|  9.70k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  9.70k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  9.70k|                                               output_q3, width, height); \
  104|  9.70k|  }
cfl_subsample_lbd_444_32x8_avx2:
  101|  15.8k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  15.8k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  15.8k|                                               output_q3, width, height); \
  104|  15.8k|  }
cfl_subsample_hbd_420_32x32_avx2:
  101|  13.4k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  13.4k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  13.4k|                                               output_q3, width, height); \
  104|  13.4k|  }
cfl_subsample_hbd_420_32x16_avx2:
  101|  4.95k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  4.95k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  4.95k|                                               output_q3, width, height); \
  104|  4.95k|  }
cfl_subsample_hbd_420_32x8_avx2:
  101|  9.08k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  9.08k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  9.08k|                                               output_q3, width, height); \
  104|  9.08k|  }
cfl_subsample_hbd_422_32x32_avx2:
  101|     16|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     16|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     16|                                               output_q3, width, height); \
  104|     16|  }
cfl_subsample_hbd_422_32x16_avx2:
  101|     56|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     56|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     56|                                               output_q3, width, height); \
  104|     56|  }
cfl_subsample_hbd_422_32x8_avx2:
  101|     47|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|     47|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|     47|                                               output_q3, width, height); \
  104|     47|  }
cfl_subsample_hbd_444_32x32_avx2:
  101|  18.5k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  18.5k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  18.5k|                                               output_q3, width, height); \
  104|  18.5k|  }
cfl_subsample_hbd_444_32x16_avx2:
  101|  12.6k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  12.6k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  12.6k|                                               output_q3, width, height); \
  104|  12.6k|  }
cfl_subsample_hbd_444_32x8_avx2:
  101|  30.0k|      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
  102|  30.0k|    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
  103|  30.0k|                                               output_q3, width, height); \
  104|  30.0k|  }
cfl_predict_lbd_32x8_avx2:
  232|  31.7k|      int alpha_q3) {                                                          \
  233|  31.7k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  31.7k|                           height);                                            \
  235|  31.7k|  }
cfl_predict_lbd_32x16_avx2:
  232|  19.6k|      int alpha_q3) {                                                          \
  233|  19.6k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  19.6k|                           height);                                            \
  235|  19.6k|  }
cfl_predict_lbd_32x32_avx2:
  232|  35.8k|      int alpha_q3) {                                                          \
  233|  35.8k|    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width,      \
  234|  35.8k|                           height);                                            \
  235|  35.8k|  }
cfl_predict_hbd_16x4_avx2:
  244|   130k|      int bd) {                                                                \
  245|   130k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|   130k|                           height);                                            \
  247|   130k|  }
cfl_predict_hbd_16x8_avx2:
  244|   122k|      int bd) {                                                                \
  245|   122k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|   122k|                           height);                                            \
  247|   122k|  }
cfl_predict_hbd_16x16_avx2:
  244|   138k|      int bd) {                                                                \
  245|   138k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|   138k|                           height);                                            \
  247|   138k|  }
cfl_predict_hbd_16x32_avx2:
  244|  19.5k|      int bd) {                                                                \
  245|  19.5k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  19.5k|                           height);                                            \
  247|  19.5k|  }
cfl_predict_hbd_32x8_avx2:
  244|  62.2k|      int bd) {                                                                \
  245|  62.2k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  62.2k|                           height);                                            \
  247|  62.2k|  }
cfl_predict_hbd_32x16_avx2:
  244|  25.7k|      int bd) {                                                                \
  245|  25.7k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  25.7k|                           height);                                            \
  247|  25.7k|  }
cfl_predict_hbd_32x32_avx2:
  244|  38.9k|      int bd) {                                                                \
  245|  38.9k|    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width,  \
  246|  38.9k|                           height);                                            \
  247|  38.9k|  }
cfl_subtract_average_16x4_avx2:
  178|   118k|                                                        int16_t *dst) {      \
  179|   118k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|   118k|                            num_pel_log2);                                   \
  181|   118k|  }
cfl_subtract_average_16x8_avx2:
  178|   103k|                                                        int16_t *dst) {      \
  179|   103k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|   103k|                            num_pel_log2);                                   \
  181|   103k|  }
cfl_subtract_average_16x16_avx2:
  178|   131k|                                                        int16_t *dst) {      \
  179|   131k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|   131k|                            num_pel_log2);                                   \
  181|   131k|  }
cfl_subtract_average_16x32_avx2:
  178|  20.2k|                                                        int16_t *dst) {      \
  179|  20.2k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  20.2k|                            num_pel_log2);                                   \
  181|  20.2k|  }
cfl_subtract_average_32x8_avx2:
  178|  47.0k|                                                        int16_t *dst) {      \
  179|  47.0k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  47.0k|                            num_pel_log2);                                   \
  181|  47.0k|  }
cfl_subtract_average_32x16_avx2:
  178|  22.6k|                                                        int16_t *dst) {      \
  179|  22.6k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  22.6k|                            num_pel_log2);                                   \
  181|  22.6k|  }
cfl_subtract_average_32x32_avx2:
  178|  37.4k|                                                        int16_t *dst) {      \
  179|  37.4k|    subtract_average_##arch(src, dst, width, height, round_offset,           \
  180|  37.4k|                            num_pel_log2);                                   \
  181|  37.4k|  }

decodeframe.c:get_unsigned_bits:
   46|   532k|static inline int get_unsigned_bits(unsigned int num_values) {
   47|   532k|  return num_values > 0 ? get_msb(num_values) + 1 : 0;
  ------------------
  |  Branch (47:10): [True: 451k, False: 80.8k]
  ------------------
   48|   532k|}
detokenize.c:get_unsigned_bits:
   46|   161k|static inline int get_unsigned_bits(unsigned int num_values) {
   47|  18.4E|  return num_values > 0 ? get_msb(num_values) + 1 : 0;
  ------------------
  |  Branch (47:10): [True: 161k, False: 18.4E]
  ------------------
   48|   161k|}

av1_convolve_2d_sr_intrabc_c:
  197|  2.74k|                                  ConvolveParams *conv_params) {
  198|  2.74k|  assert(subpel_x_qn == 8);
  199|  2.74k|  assert(subpel_y_qn == 8);
  200|  2.74k|  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  201|  2.74k|  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  202|  2.74k|  (void)filter_params_x;
  203|  2.74k|  (void)subpel_x_qn;
  204|  2.74k|  (void)filter_params_y;
  205|  2.74k|  (void)subpel_y_qn;
  206|  2.74k|  (void)conv_params;
  207|       |
  208|  2.74k|  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  209|  2.74k|  int im_h = h + 1;
  210|  2.74k|  int im_stride = w;
  211|  2.74k|  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  212|  2.74k|  const int bd = 8;
  213|       |
  214|       |  // horizontal filter
  215|       |  // explicitly operate for subpel_x_qn = 8.
  216|  2.74k|  int16_t *im = im_block;
  217|  26.8k|  for (int y = 0; y < im_h; ++y) {
  ------------------
  |  Branch (217:19): [True: 24.1k, False: 2.74k]
  ------------------
  218|   331k|    for (int x = 0; x < w; ++x) {
  ------------------
  |  Branch (218:21): [True: 307k, False: 24.1k]
  ------------------
  219|   307k|      const int32_t sum = (1 << bd) + src[x] + src[x + 1];
  220|   307k|      assert(0 <= sum && sum < (1 << (bd + 2)));
  221|   307k|      im[x] = sum;
  222|   307k|    }
  223|  24.1k|    src += src_stride;
  224|  24.1k|    im += im_stride;
  225|  24.1k|  }
  226|       |
  227|       |  // vertical filter
  228|       |  // explicitly operate for subpel_y_qn = 8.
  229|  2.74k|  int16_t *src_vert = im_block;
  230|  24.1k|  for (int y = 0; y < h; ++y) {
  ------------------
  |  Branch (230:19): [True: 21.3k, False: 2.74k]
  ------------------
  231|   307k|    for (int x = 0; x < w; ++x) {
  ------------------
  |  Branch (231:21): [True: 285k, False: 21.3k]
  ------------------
  232|   285k|      const int32_t sum =
  233|   285k|          (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x];
  234|   285k|      assert(0 <= sum && sum < (1 << (bd + 4)));
  235|   285k|      const int16_t res =
  236|   285k|          ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1)));
  ------------------
  |  |   41|   285k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  237|   285k|      dst[x] = clip_pixel(res);
  238|   285k|    }
  239|  21.3k|    src_vert += im_stride;
  240|  21.3k|    dst += dst_stride;
  241|  21.3k|  }
  242|  2.74k|}
av1_convolve_y_sr_intrabc_c:
  249|  2.98k|                                 const int subpel_y_qn) {
  250|  2.98k|  assert(subpel_y_qn == 8);
  251|  2.98k|  assert(filter_params_y->taps == 2);
  252|  2.98k|  (void)filter_params_y;
  253|  2.98k|  (void)subpel_y_qn;
  254|       |
  255|       |  // vertical filter
  256|       |  // explicitly operate for subpel_y_qn = 8.
  257|  26.3k|  for (int y = 0; y < h; ++y) {
  ------------------
  |  Branch (257:19): [True: 23.4k, False: 2.98k]
  ------------------
  258|   321k|    for (int x = 0; x < w; ++x) {
  ------------------
  |  Branch (258:21): [True: 297k, False: 23.4k]
  ------------------
  259|   297k|      const int32_t res = src[x] + src[src_stride + x];
  260|   297k|      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
  ------------------
  |  |   41|   297k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  261|   297k|    }
  262|  23.4k|    src += src_stride;
  263|  23.4k|    dst += dst_stride;
  264|  23.4k|  }
  265|  2.98k|}
av1_convolve_x_sr_intrabc_c:
  273|  2.93k|                                 ConvolveParams *conv_params) {
  274|  2.93k|  assert(subpel_x_qn == 8);
  275|  2.93k|  assert(filter_params_x->taps == 2);
  276|  2.93k|  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  277|  2.93k|  (void)filter_params_x;
  278|  2.93k|  (void)subpel_x_qn;
  279|  2.93k|  (void)conv_params;
  280|       |
  281|       |  // horizontal filter
  282|       |  // explicitly operate for subpel_x_qn = 8.
  283|  25.9k|  for (int y = 0; y < h; ++y) {
  ------------------
  |  Branch (283:19): [True: 23.0k, False: 2.93k]
  ------------------
  284|   329k|    for (int x = 0; x < w; ++x) {
  ------------------
  |  Branch (284:21): [True: 306k, False: 23.0k]
  ------------------
  285|   306k|      const int32_t res = src[x] + src[x + 1];
  286|   306k|      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
  ------------------
  |  |   41|   306k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  287|   306k|    }
  288|  23.0k|    src += src_stride;
  289|  23.0k|    dst += dst_stride;
  290|  23.0k|  }
  291|  2.93k|}
av1_convolve_2d_facade:
  643|  8.14M|                            ConvolveParams *conv_params) {
  644|  8.14M|  (void)x_step_q4;
  645|  8.14M|  (void)y_step_q4;
  646|  8.14M|  (void)dst;
  647|  8.14M|  (void)dst_stride;
  648|       |
  649|  8.14M|  const InterpFilterParams *filter_params_x = interp_filters[0];
  650|  8.14M|  const InterpFilterParams *filter_params_y = interp_filters[1];
  651|       |
  652|       |  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  653|       |  // 2-tap filter indicates that it is for IntraBC.
  654|  8.14M|  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
  ------------------
  |  Branch (654:7): [True: 56.2k, False: 8.09M]
  |  Branch (654:37): [True: 18.4E, False: 8.09M]
  ------------------
  655|  56.5k|    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  656|  56.5k|    assert(!scaled);
  657|  56.5k|    if (subpel_x_qn && subpel_y_qn) {
  ------------------
  |  Branch (657:9): [True: 5.67k, False: 50.8k]
  |  Branch (657:24): [True: 2.74k, False: 2.93k]
  ------------------
  658|  2.74k|      av1_convolve_2d_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
  ------------------
  |  |  120|  2.74k|#define av1_convolve_2d_sr_intrabc av1_convolve_2d_sr_intrabc_c
  ------------------
  659|  2.74k|                                 filter_params_x, filter_params_y, subpel_x_qn,
  660|  2.74k|                                 subpel_y_qn, conv_params);
  661|  2.74k|      return;
  662|  53.7k|    } else if (subpel_x_qn) {
  ------------------
  |  Branch (662:16): [True: 2.93k, False: 50.8k]
  ------------------
  663|  2.93k|      av1_convolve_x_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
  ------------------
  |  |  132|  2.93k|#define av1_convolve_x_sr_intrabc av1_convolve_x_sr_intrabc_c
  ------------------
  664|  2.93k|                                filter_params_x, subpel_x_qn, conv_params);
  665|  2.93k|      return;
  666|  50.8k|    } else if (subpel_y_qn) {
  ------------------
  |  Branch (666:16): [True: 2.98k, False: 47.8k]
  ------------------
  667|  2.98k|      av1_convolve_y_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
  ------------------
  |  |  140|  2.98k|#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
  ------------------
  668|  2.98k|                                filter_params_y, subpel_y_qn);
  669|  2.98k|      return;
  670|  2.98k|    }
  671|  56.5k|  }
  672|       |
  673|  8.14M|  if (scaled) {
  ------------------
  |  Branch (673:7): [True: 1.59M, False: 6.54M]
  ------------------
  674|  1.59M|    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
  675|  1.59M|                              filter_params_x, filter_params_y, subpel_x_qn,
  676|  1.59M|                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
  677|  6.54M|  } else if (conv_params->is_compound) {
  ------------------
  |  Branch (677:14): [True: 1.44M, False: 5.09M]
  ------------------
  678|  1.44M|    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
  679|  1.44M|                                filter_params_x, filter_params_y, subpel_x_qn,
  680|  1.44M|                                subpel_y_qn, conv_params);
  681|  5.09M|  } else {
  682|  5.09M|    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
  683|  5.09M|                              filter_params_x, filter_params_y, subpel_x_qn,
  684|  5.09M|                              subpel_y_qn, conv_params);
  685|  5.09M|  }
  686|  8.14M|}
av1_highbd_convolve_2d_sr_intrabc_c:
  803|  1.45k|    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  804|  1.45k|  const int bits =
  805|  1.45k|      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  1.45k|#define FILTER_BITS 7
  ------------------
  806|  1.45k|  assert(bits >= 0);
  807|  1.45k|  assert(subpel_x_qn == 8);
  808|  1.45k|  assert(subpel_y_qn == 8);
  809|  1.45k|  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  810|  1.45k|  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  811|  1.45k|  (void)filter_params_x;
  812|  1.45k|  (void)subpel_x_qn;
  813|  1.45k|  (void)filter_params_y;
  814|  1.45k|  (void)subpel_y_qn;
  815|  1.45k|  (void)conv_params;
  816|       |
  817|  1.45k|  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  818|  1.45k|  int im_h = h + 1;
  819|  1.45k|  int im_stride = w;
  820|  1.45k|  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  821|       |
  822|       |  // horizontal filter
  823|       |  // explicitly operate for subpel_x_qn = 8.
  824|  1.45k|  int16_t *im = im_block;
  825|  15.1k|  for (int y = 0; y < im_h; ++y) {
  ------------------
  |  Branch (825:19): [True: 13.7k, False: 1.45k]
  ------------------
  826|   147k|    for (int x = 0; x < w; ++x) {
  ------------------
  |  Branch (826:21): [True: 133k, False: 13.7k]
  ------------------
  827|   133k|      int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
  ------------------
  |  |   21|   133k|#define FILTER_BITS 7
  ------------------
  828|   133k|      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
  829|   133k|      sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
  ------------------
  |  |   41|   133k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  830|   133k|      im[x] = sum;
  831|   133k|    }
  832|  13.7k|    src += src_stride;
  833|  13.7k|    im += im_stride;
  834|  13.7k|  }
  835|       |
  836|       |  // vertical filter
  837|       |  // explicitly operate for subpel_y_qn = 8.
  838|  1.45k|  int16_t *src_vert = im_block;
  839|  1.45k|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|  1.45k|#define FILTER_BITS 7
  ------------------
  840|  13.7k|  for (int y = 0; y < h; ++y) {
  ------------------
  |  Branch (840:19): [True: 12.2k, False: 1.45k]
  ------------------
  841|   134k|    for (int x = 0; x < w; ++x) {
  ------------------
  |  Branch (841:21): [True: 122k, False: 12.2k]
  ------------------
  842|   122k|      const int32_t sum =
  843|   122k|          (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
  844|   122k|      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
  845|   122k|      const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
  ------------------
  |  |   41|   122k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  846|   122k|                          ((1 << (offset_bits - conv_params->round_1)) +
  847|   122k|                           (1 << (offset_bits - conv_params->round_1 - 1)));
  848|       |
  849|   122k|      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
  ------------------
  |  |   41|   122k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  850|   122k|    }
  851|  12.2k|    src_vert += im_stride;
  852|  12.2k|    dst += dst_stride;
  853|  12.2k|  }
  854|  1.45k|}
av1_highbd_convolve_y_sr_intrabc_c:
  861|  1.45k|    int bd) {
  862|  1.45k|  assert(subpel_y_qn == 8);
  863|  1.45k|  assert(filter_params_y->taps == 2);
  864|  1.45k|  (void)filter_params_y;
  865|  1.45k|  (void)subpel_y_qn;
  866|       |
  867|       |  // vertical filter
  868|       |  // explicitly operate for subpel_y_qn = 8.
  869|  14.2k|  for (int y = 0; y < h; ++y) {
  ------------------
  |  Branch (869:19): [True: 12.8k, False: 1.45k]
  ------------------
  870|   206k|    for (int x = 0; x < w; ++x) {
  ------------------
  |  Branch (870:21): [True: 193k, False: 12.8k]
  ------------------
  871|   193k|      const int32_t res = src[x] + src[src_stride + x];
  872|   193k|      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd);
  ------------------
  |  |   41|   193k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  873|   193k|    }
  874|  12.8k|    src += src_stride;
  875|  12.8k|    dst += dst_stride;
  876|  12.8k|  }
  877|  1.45k|}
av1_highbd_convolve_x_sr_intrabc_c:
  884|  1.41k|    ConvolveParams *conv_params, int bd) {
  885|  1.41k|  const int bits = FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|  1.41k|#define FILTER_BITS 7
  ------------------
  886|  1.41k|  assert(bits >= 0);
  887|  1.41k|  assert(subpel_x_qn == 8);
  888|  1.41k|  assert(filter_params_x->taps == 2);
  889|  1.41k|  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  890|  1.41k|  (void)filter_params_x;
  891|  1.41k|  (void)subpel_x_qn;
  892|       |
  893|       |  // horizontal filter
  894|       |  // explicitly operate for subpel_x_qn = 8.
  895|  15.5k|  for (int y = 0; y < h; ++y) {
  ------------------
  |  Branch (895:19): [True: 14.1k, False: 1.41k]
  ------------------
  896|   377k|    for (int x = 0; x < w; ++x) {
  ------------------
  |  Branch (896:21): [True: 363k, False: 14.1k]
  ------------------
  897|   363k|      int32_t res = 64 * (src[x] + src[x + 1]);
  898|   363k|      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
  ------------------
  |  |   41|   363k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  899|   363k|      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
  ------------------
  |  |   41|   363k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  900|   363k|    }
  901|  14.1k|    src += src_stride;
  902|  14.1k|    dst += dst_stride;
  903|  14.1k|  }
  904|  1.41k|}
av1_highbd_convolve_2d_facade:
 1252|  8.38M|                                   int bd) {
 1253|  8.38M|  (void)x_step_q4;
 1254|  8.38M|  (void)y_step_q4;
 1255|  8.38M|  (void)dst_stride;
 1256|  8.38M|  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  ------------------
  |  |   75|  8.38M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1257|       |
 1258|  8.38M|  const InterpFilterParams *filter_params_x = interp_filters[0];
 1259|  8.38M|  const InterpFilterParams *filter_params_y = interp_filters[1];
 1260|       |
 1261|  8.38M|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|  8.38M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1262|       |  // 2-tap filter indicates that it is for IntraBC.
 1263|  8.38M|  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
  ------------------
  |  Branch (1263:7): [True: 91.2k, False: 8.28M]
  |  Branch (1263:37): [True: 18.4E, False: 8.28M]
  ------------------
 1264|  91.2k|    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
 1265|  91.2k|    assert(!scaled);
 1266|  91.2k|    if (subpel_x_qn && subpel_y_qn) {
  ------------------
  |  Branch (1266:9): [True: 2.86k, False: 88.4k]
  |  Branch (1266:24): [True: 1.45k, False: 1.41k]
  ------------------
 1267|  1.45k|      av1_highbd_convolve_2d_sr_intrabc_c(
 1268|  1.45k|          src, src_stride, dst, dst_stride, w, h, filter_params_x,
 1269|  1.45k|          filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
 1270|  1.45k|      return;
 1271|  89.8k|    } else if (subpel_x_qn) {
  ------------------
  |  Branch (1271:16): [True: 1.41k, False: 88.4k]
  ------------------
 1272|  1.41k|      av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
 1273|  1.41k|                                         filter_params_x, subpel_x_qn,
 1274|  1.41k|                                         conv_params, bd);
 1275|  1.41k|      return;
 1276|  88.4k|    } else if (subpel_y_qn) {
  ------------------
  |  Branch (1276:16): [True: 1.45k, False: 86.9k]
  ------------------
 1277|  1.45k|      av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
 1278|  1.45k|                                         filter_params_y, subpel_y_qn, bd);
 1279|  1.45k|      return;
 1280|  1.45k|    }
 1281|  91.2k|  }
 1282|       |
 1283|  8.37M|  if (scaled) {
  ------------------
  |  Branch (1283:7): [True: 724k, False: 7.65M]
  ------------------
 1284|   724k|    if (conv_params->is_compound) {
  ------------------
  |  Branch (1284:9): [True: 117k, False: 606k]
  ------------------
 1285|   117k|      assert(conv_params->dst != NULL);
 1286|   117k|    }
 1287|   724k|    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
 1288|   724k|                                 filter_params_x, filter_params_y, subpel_x_qn,
 1289|   724k|                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
 1290|   724k|                                 bd);
 1291|  7.65M|  } else if (conv_params->is_compound) {
  ------------------
  |  Branch (1291:14): [True: 1.51M, False: 6.13M]
  ------------------
 1292|  1.51M|    highbd_convolve_2d_facade_compound(
 1293|  1.51M|        src, src_stride, dst, dst_stride, w, h, filter_params_x,
 1294|  1.51M|        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
 1295|  6.13M|  } else {
 1296|  6.13M|    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
 1297|  6.13M|                                     filter_params_x, filter_params_y,
 1298|  6.13M|                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
 1299|  6.13M|  }
 1300|  8.37M|}
convolve.c:convolve_2d_scale_wrapper:
  583|  1.59M|    ConvolveParams *conv_params) {
  584|  1.59M|  if (conv_params->is_compound) {
  ------------------
  |  Branch (584:7): [True: 261k, False: 1.33M]
  ------------------
  585|   261k|    assert(conv_params->dst != NULL);
  586|   261k|  }
  587|  1.59M|  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
  588|  1.59M|                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
  589|  1.59M|                        y_step_qn, conv_params);
  590|  1.59M|}
convolve.c:convolve_2d_facade_compound:
  596|  1.44M|    const int subpel_y_qn, ConvolveParams *conv_params) {
  597|  1.44M|  const bool need_x = subpel_x_qn != 0;
  598|  1.44M|  const bool need_y = subpel_y_qn != 0;
  599|  1.44M|  if (!need_x && !need_y) {
  ------------------
  |  Branch (599:7): [True: 1.05M, False: 391k]
  |  Branch (599:18): [True: 975k, False: 79.3k]
  ------------------
  600|   975k|    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
  601|   975k|                                  conv_params);
  602|   975k|  } else if (need_x && !need_y) {
  ------------------
  |  Branch (602:14): [True: 391k, False: 79.1k]
  |  Branch (602:24): [True: 153k, False: 238k]
  ------------------
  603|   153k|    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
  604|   153k|                            filter_params_x, subpel_x_qn, conv_params);
  605|   317k|  } else if (!need_x && need_y) {
  ------------------
  |  Branch (605:14): [True: 79.3k, False: 237k]
  |  Branch (605:25): [True: 79.3k, False: 1]
  ------------------
  606|  79.3k|    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
  607|  79.3k|                            filter_params_y, subpel_y_qn, conv_params);
  608|   237k|  } else {
  609|   237k|    assert(need_y && need_x);
  610|   238k|    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
  611|   238k|                             filter_params_x, filter_params_y, subpel_x_qn,
  612|   238k|                             subpel_y_qn, conv_params);
  613|   238k|  }
  614|  1.44M|}
convolve.c:convolve_2d_facade_single:
  620|  5.10M|    const int subpel_y_qn, ConvolveParams *conv_params) {
  621|  5.10M|  const bool need_x = subpel_x_qn != 0;
  622|  5.10M|  const bool need_y = subpel_y_qn != 0;
  623|  5.10M|  if (!need_x && !need_y) {
  ------------------
  |  Branch (623:7): [True: 2.45M, False: 2.64M]
  |  Branch (623:18): [True: 1.70M, False: 747k]
  ------------------
  624|  1.70M|    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
  625|  3.39M|  } else if (need_x && !need_y) {
  ------------------
  |  Branch (625:14): [True: 2.64M, False: 747k]
  |  Branch (625:24): [True: 828k, False: 1.82M]
  ------------------
  626|   828k|    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
  627|   828k|                      subpel_x_qn, conv_params);
  628|  2.56M|  } else if (!need_x && need_y) {
  ------------------
  |  Branch (628:14): [True: 747k, False: 1.82M]
  |  Branch (628:25): [True: 747k, False: 18.4E]
  ------------------
  629|   747k|    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
  630|   747k|                      subpel_y_qn);
  631|  1.82M|  } else {
  632|  1.82M|    assert(need_x && need_y);
  633|  1.82M|    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
  634|  1.82M|                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
  635|  1.82M|  }
  636|  5.10M|}
convolve.c:highbd_convolve_2d_facade_compound:
 1200|  1.51M|    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
 1201|  1.51M|  const bool need_x = subpel_x_qn != 0;
 1202|  1.51M|  const bool need_y = subpel_y_qn != 0;
 1203|  1.51M|  if (!need_x && !need_y) {
  ------------------
  |  Branch (1203:7): [True: 466k, False: 1.04M]
  |  Branch (1203:18): [True: 312k, False: 154k]
  ------------------
 1204|   312k|    av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
 1205|   312k|                                         conv_params, bd);
 1206|  1.20M|  } else if (need_x && !need_y) {
  ------------------
  |  Branch (1206:14): [True: 1.04M, False: 154k]
  |  Branch (1206:24): [True: 282k, False: 764k]
  ------------------
 1207|   282k|    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
 1208|   282k|                                   filter_params_x, subpel_x_qn, conv_params,
 1209|   282k|                                   bd);
 1210|   918k|  } else if (!need_x && need_y) {
  ------------------
  |  Branch (1210:14): [True: 154k, False: 764k]
  |  Branch (1210:25): [True: 154k, False: 0]
  ------------------
 1211|   154k|    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
 1212|   154k|                                   filter_params_y, subpel_y_qn, conv_params,
 1213|   154k|                                   bd);
 1214|   764k|  } else {
 1215|   764k|    assert(need_x && need_y);
 1216|   764k|    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
 1217|   764k|                                    filter_params_x, filter_params_y,
 1218|   764k|                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
 1219|   764k|  }
 1220|  1.51M|}
convolve.c:highbd_convolve_2d_facade_single:
 1226|  6.13M|    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
 1227|  6.13M|  const bool need_x = subpel_x_qn != 0;
 1228|  6.13M|  const bool need_y = subpel_y_qn != 0;
 1229|       |
 1230|  6.13M|  if (!need_x && !need_y) {
  ------------------
  |  Branch (1230:7): [True: 2.25M, False: 3.88M]
  |  Branch (1230:18): [True: 1.23M, False: 1.02M]
  ------------------
 1231|  1.23M|    aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
 1232|  4.90M|  } else if (need_x && !need_y) {
  ------------------
  |  Branch (1232:14): [True: 3.88M, False: 1.02M]
  |  Branch (1232:24): [True: 890k, False: 2.99M]
  ------------------
 1233|   890k|    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
 1234|   890k|                             filter_params_x, subpel_x_qn, conv_params, bd);
 1235|  4.01M|  } else if (!need_x && need_y) {
  ------------------
  |  Branch (1235:14): [True: 1.02M, False: 2.99M]
  |  Branch (1235:25): [True: 1.02M, False: 0]
  ------------------
 1236|  1.02M|    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
 1237|  1.02M|                             filter_params_y, subpel_y_qn, bd);
 1238|  2.99M|  } else {
 1239|  2.99M|    assert(need_x && need_y);
 1240|  2.99M|    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
 1241|  2.99M|                              filter_params_x, filter_params_y, subpel_x_qn,
 1242|  2.99M|                              subpel_y_qn, conv_params, bd);
 1243|  2.99M|  }
 1244|  6.13M|}

decodeframe.c:get_conv_params_no_round:
   71|  17.0M|                                                      int is_compound, int bd) {
   72|  17.0M|  ConvolveParams conv_params;
   73|  17.0M|  assert(IMPLIES(cmp_index, is_compound));
   74|       |
   75|  17.0M|  conv_params.is_compound = is_compound;
   76|  17.0M|  conv_params.use_dist_wtd_comp_avg = 0;
   77|  17.0M|  conv_params.round_0 = ROUND0_BITS;
  ------------------
  |  |   39|  17.0M|#define ROUND0_BITS 3
  ------------------
   78|  17.0M|  conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
  ------------------
  |  |   40|  3.36M|#define COMPOUND_ROUND1_BITS 7
  ------------------
  |  Branch (78:25): [True: 3.36M, False: 13.6M]
  ------------------
   79|  17.0M|                                    : 2 * FILTER_BITS - conv_params.round_0;
  ------------------
  |  |   21|  13.6M|#define FILTER_BITS 7
  ------------------
   80|  17.0M|#if CONFIG_AV1_HIGHBITDEPTH
   81|  17.0M|  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
  ------------------
  |  |   21|  17.0M|#define FILTER_BITS 7
  ------------------
   82|  17.0M|  assert(IMPLIES(bd < 12, intbufrange <= 16));
   83|  17.0M|  if (intbufrange > 16) {
  ------------------
  |  Branch (83:7): [True: 315k, False: 16.7M]
  ------------------
   84|   315k|    conv_params.round_0 += intbufrange - 16;
   85|   315k|    if (!is_compound) conv_params.round_1 -= intbufrange - 16;
  ------------------
  |  Branch (85:9): [True: 291k, False: 23.9k]
  ------------------
   86|   315k|  }
   87|       |#else
   88|       |  (void)bd;
   89|       |#endif  // CONFIG_AV1_HIGHBITDEPTH
   90|       |  // TODO(yunqing): The following dst should only be valid while
   91|       |  // is_compound = 1;
   92|  17.0M|  conv_params.dst = dst;
   93|  17.0M|  conv_params.dst_stride = dst_stride;
   94|  17.0M|  conv_params.plane = plane;
   95|       |
   96|       |  // By default, set do average to 1 if this is the second single prediction
   97|       |  // in a compound mode.
   98|  17.0M|  conv_params.do_average = cmp_index;
   99|  17.0M|  return conv_params;
  100|  17.0M|}
restoration.c:get_conv_params_wiener:
  107|  99.7k|static inline WienerConvolveParams get_conv_params_wiener(int bd) {
  108|  99.7k|  WienerConvolveParams conv_params;
  109|  99.7k|  conv_params.round_0 = WIENER_ROUND0_BITS;
  ------------------
  |  |   41|  99.7k|#define WIENER_ROUND0_BITS 3
  ------------------
  110|  99.7k|  conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
  ------------------
  |  |   21|  99.7k|#define FILTER_BITS 7
  ------------------
  111|  99.7k|  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
  ------------------
  |  |   21|  99.7k|#define FILTER_BITS 7
  ------------------
  112|  99.7k|  assert(IMPLIES(bd < 12, intbufrange <= 16));
  113|  99.7k|  if (intbufrange > 16) {
  ------------------
  |  Branch (113:7): [True: 501, False: 99.2k]
  ------------------
  114|    501|    conv_params.round_0 += intbufrange - 16;
  115|    501|    conv_params.round_1 -= intbufrange - 16;
  116|    501|  }
  117|  99.7k|  return conv_params;
  118|  99.7k|}

av1_default_coef_probs:
   31|   150k|void av1_default_coef_probs(AV1_COMMON *cm) {
   32|   150k|  const int index = get_q_ctx(cm->quant_params.base_qindex);
   33|       |#if CONFIG_ENTROPY_STATS
   34|       |  cm->coef_cdf_category = index;
   35|       |#endif
   36|       |
   37|   150k|  av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   38|   150k|  av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   39|   150k|  av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   40|   150k|  av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   41|   150k|  av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   42|   150k|  av1_copy(cm->fc->coeff_base_eob_cdf,
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   43|   150k|           av1_default_coeff_base_eob_multi_cdfs[index]);
   44|   150k|  av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   45|   150k|  av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   46|   150k|  av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   47|   150k|  av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   48|   150k|  av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   49|   150k|  av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   50|   150k|  av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   51|   150k|}
av1_reset_cdf_symbol_counters:
   85|  27.5k|void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
   86|  27.5k|  RESET_CDF_COUNTER(fc->txb_skip_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   87|  27.5k|  RESET_CDF_COUNTER(fc->eob_extra_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   88|  27.5k|  RESET_CDF_COUNTER(fc->dc_sign_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   89|  27.5k|  RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   90|  27.5k|  RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   91|  27.5k|  RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   92|  27.5k|  RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   93|  27.5k|  RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   94|  27.5k|  RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   95|  27.5k|  RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   96|  27.5k|  RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   97|  27.5k|  RESET_CDF_COUNTER(fc->coeff_base_cdf, 4);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   98|  27.5k|  RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   99|  27.5k|  RESET_CDF_COUNTER(fc->newmv_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  100|  27.5k|  RESET_CDF_COUNTER(fc->zeromv_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  101|  27.5k|  RESET_CDF_COUNTER(fc->refmv_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  102|  27.5k|  RESET_CDF_COUNTER(fc->drl_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  103|  27.5k|  RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  104|  27.5k|  RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  105|  27.5k|  RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  106|  27.5k|  RESET_CDF_COUNTER(fc->interintra_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  107|  27.5k|  RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  108|  27.5k|  RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  109|  27.5k|  RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  110|  27.5k|  RESET_CDF_COUNTER(fc->obmc_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  111|  27.5k|  RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  112|  27.5k|  RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  113|   220k|  for (int j = 0; j < PALETTE_SIZES; j++) {
  ------------------
  |  Branch (113:19): [True: 193k, False: 27.5k]
  ------------------
  114|   193k|    int nsymbs = j + PALETTE_MIN_SIZE;
  ------------------
  |  |   65|   193k|#define PALETTE_MIN_SIZE 2
  ------------------
  115|   193k|    RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs,
  ------------------
  |  |   64|   193k|  do {                                                               \
  |  |   65|   193k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|   193k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|   193k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|   193k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|   193k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  116|   193k|                             CDF_SIZE(PALETTE_COLORS));
  117|   193k|    RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs,
  ------------------
  |  |   64|   193k|  do {                                                               \
  |  |   65|   193k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|   193k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|   193k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|   193k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|   193k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  118|   193k|                             CDF_SIZE(PALETTE_COLORS));
  119|   193k|  }
  120|  27.5k|  RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  121|  27.5k|  RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  122|  27.5k|  RESET_CDF_COUNTER(fc->comp_inter_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  123|  27.5k|  RESET_CDF_COUNTER(fc->single_ref_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  124|  27.5k|  RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  125|  27.5k|  RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  126|  27.5k|  RESET_CDF_COUNTER(fc->comp_ref_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  127|  27.5k|  RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  128|  27.5k|  RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  129|  27.5k|  RESET_CDF_COUNTER(fc->compound_index_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  130|  27.5k|  RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  131|  27.5k|  RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  132|  27.5k|  RESET_CDF_COUNTER(fc->skip_txfm_cdfs, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  133|  27.5k|  RESET_CDF_COUNTER(fc->intra_inter_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  134|  27.5k|  reset_nmv_counter(&fc->nmvc);
  135|  27.5k|  reset_nmv_counter(&fc->ndvc);
  136|  27.5k|  RESET_CDF_COUNTER(fc->intrabc_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  137|  27.5k|  RESET_CDF_COUNTER(fc->seg.pred_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  138|  27.5k|  RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  139|  27.5k|  RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  140|  27.5k|  RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  141|  27.5k|  RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  142|  27.5k|  RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  143|  27.5k|  RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  144|  27.5k|  RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  145|  27.5k|  RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1,
  ------------------
  |  |   64|  27.5k|  do {                                                               \
  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  27.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  146|  27.5k|                           CDF_SIZE(UV_INTRA_MODES));
  147|  27.5k|  RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  148|   579k|  for (int i = 0; i < PARTITION_CONTEXTS; i++) {
  ------------------
  |  |  171|   579k|#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
  |  |  ------------------
  |  |  |  |  170|   579k|#define PARTITION_BLOCK_SIZES 5
  |  |  ------------------
  |  |               #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
  |  |  ------------------
  |  |  |  |  169|   579k|#define PARTITION_PLOFFSET 4  // number of probability models per block size
  |  |  ------------------
  ------------------
  |  Branch (148:19): [True: 551k, False: 27.5k]
  ------------------
  149|   551k|    if (i < 4) {
  ------------------
  |  Branch (149:9): [True: 110k, False: 441k]
  ------------------
  150|   110k|      RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10));
  ------------------
  |  |   64|   110k|  do {                                                               \
  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|   110k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  151|   441k|    } else if (i < 16) {
  ------------------
  |  Branch (151:16): [True: 330k, False: 110k]
  ------------------
  152|   330k|      RESET_CDF_COUNTER(fc->partition_cdf[i], 10);
  ------------------
  |  |   61|   330k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   330k|  do {                                                               \
  |  |  |  |   65|   330k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   330k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   330k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   330k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   330k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  153|   330k|    } else {
  154|   110k|      RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10));
  ------------------
  |  |   64|   110k|  do {                                                               \
  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|   110k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  155|   110k|    }
  156|   551k|  }
  157|  27.5k|  RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  158|  27.5k|  RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  159|  27.5k|  RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  160|  27.5k|  RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH,
  ------------------
  |  |   64|  27.5k|  do {                                                               \
  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  27.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  161|  27.5k|                           CDF_SIZE(MAX_TX_DEPTH + 1));
  162|  27.5k|  RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  163|  27.5k|  RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  164|  27.5k|  RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  165|  27.5k|  RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  166|  27.5k|  RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  167|   137k|  for (int i = 0; i < FRAME_LF_COUNT; i++) {
  ------------------
  |  |   72|   137k|#define FRAME_LF_COUNT 4
  ------------------
  |  Branch (167:19): [True: 110k, False: 27.5k]
  ------------------
  168|   110k|    RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1);
  ------------------
  |  |   61|   110k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   110k|  do {                                                               \
  |  |  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   110k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  169|   110k|  }
  170|  27.5k|  RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  27.5k|  do {                                                               \
  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  27.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  171|  27.5k|  RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  27.5k|  do {                                                               \
  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  27.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  172|  27.5k|  RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  27.5k|  do {                                                               \
  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  27.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  173|  27.5k|  RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  27.5k|  do {                                                               \
  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  27.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  174|  27.5k|  RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES));
  ------------------
  |  |   64|  27.5k|  do {                                                               \
  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |   69|  27.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  175|  27.5k|  RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  176|  27.5k|  RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE);
  ------------------
  |  |   61|  27.5k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  27.5k|  do {                                                               \
  |  |  |  |   65|  27.5k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  27.5k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  27.5k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  27.5k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  177|  27.5k|}
entropy.c:get_q_ctx:
   24|   150k|static int get_q_ctx(int q) {
   25|   150k|  if (q <= 20) return 0;
  ------------------
  |  Branch (25:7): [True: 68.3k, False: 81.8k]
  ------------------
   26|  81.8k|  if (q <= 60) return 1;
  ------------------
  |  Branch (26:7): [True: 37.5k, False: 44.3k]
  ------------------
   27|  44.3k|  if (q <= 120) return 2;
  ------------------
  |  Branch (27:7): [True: 7.24k, False: 37.0k]
  ------------------
   28|  37.0k|  return 3;
   29|  44.3k|}
entropy.c:reset_cdf_symbol_counter:
   54|  3.86M|                                            int cdf_stride, int nsymbs) {
   55|  46.6M|  for (int i = 0; i < num_cdfs; i++) {
  ------------------
  |  Branch (55:19): [True: 42.7M, False: 3.86M]
  ------------------
   56|  42.7M|    cdf_ptr[i * cdf_stride + nsymbs] = 0;
   57|  42.7M|  }
   58|  3.86M|}
entropy.c:reset_nmv_counter:
   71|  55.1k|static inline void reset_nmv_counter(nmv_context *nmv) {
   72|  55.1k|  RESET_CDF_COUNTER(nmv->joints_cdf, 4);
  ------------------
  |  |   61|  55.1k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|  55.1k|  do {                                                               \
  |  |  |  |   65|  55.1k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|  55.1k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|  55.1k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|  55.1k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|  55.1k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   73|   165k|  for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (73:19): [True: 110k, False: 55.1k]
  ------------------
   74|   110k|    RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES);
  ------------------
  |  |   61|   110k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   110k|  do {                                                               \
  |  |  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   110k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   75|   110k|    RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE);
  ------------------
  |  |   61|   110k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   110k|  do {                                                               \
  |  |  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   110k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   76|   110k|    RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE);
  ------------------
  |  |   61|   110k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   110k|  do {                                                               \
  |  |  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   110k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   77|   110k|    RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2);
  ------------------
  |  |   61|   110k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   110k|  do {                                                               \
  |  |  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   110k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   78|   110k|    RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2);
  ------------------
  |  |   61|   110k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   110k|  do {                                                               \
  |  |  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   110k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   79|   110k|    RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2);
  ------------------
  |  |   61|   110k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   110k|  do {                                                               \
  |  |  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   110k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   80|   110k|    RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE);
  ------------------
  |  |   61|   110k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   110k|  do {                                                               \
  |  |  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   110k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   81|   110k|    RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2);
  ------------------
  |  |   61|   110k|  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
  |  |  ------------------
  |  |  |  |   64|   110k|  do {                                                               \
  |  |  |  |   65|   110k|    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname;                   \
  |  |  |  |   66|   110k|    int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob);      \
  |  |  |  |   67|   110k|    int num_cdfs = array_size / cdf_stride;                          \
  |  |  |  |   68|   110k|    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
  |  |  |  |   69|   110k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (69:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   82|   110k|  }
   83|  55.1k|}

decodetxb.c:get_entropy_context:
   88|  20.7M|                                      const ENTROPY_CONTEXT *l) {
   89|  20.7M|  ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
   90|       |
   91|  20.7M|  switch (tx_size) {
   92|  5.87M|    case TX_4X4:
  ------------------
  |  Branch (92:5): [True: 5.87M, False: 14.8M]
  ------------------
   93|  5.87M|      above_ec = a[0] != 0;
   94|  5.87M|      left_ec = l[0] != 0;
   95|  5.87M|      break;
   96|  1.19M|    case TX_4X8:
  ------------------
  |  Branch (96:5): [True: 1.19M, False: 19.5M]
  ------------------
   97|  1.19M|      above_ec = a[0] != 0;
   98|  1.19M|      left_ec = !!*(const uint16_t *)l;
   99|  1.19M|      break;
  100|  2.02M|    case TX_8X4:
  ------------------
  |  Branch (100:5): [True: 2.02M, False: 18.7M]
  ------------------
  101|  2.02M|      above_ec = !!*(const uint16_t *)a;
  102|  2.02M|      left_ec = l[0] != 0;
  103|  2.02M|      break;
  104|   576k|    case TX_8X16:
  ------------------
  |  Branch (104:5): [True: 576k, False: 20.1M]
  ------------------
  105|   576k|      above_ec = !!*(const uint16_t *)a;
  106|   576k|      left_ec = !!*(const uint32_t *)l;
  107|   576k|      break;
  108|  1.11M|    case TX_16X8:
  ------------------
  |  Branch (108:5): [True: 1.11M, False: 19.6M]
  ------------------
  109|  1.11M|      above_ec = !!*(const uint32_t *)a;
  110|  1.11M|      left_ec = !!*(const uint16_t *)l;
  111|  1.11M|      break;
  112|   133k|    case TX_16X32:
  ------------------
  |  Branch (112:5): [True: 133k, False: 20.6M]
  ------------------
  113|   133k|      above_ec = !!*(const uint32_t *)a;
  114|   133k|      left_ec = !!*(const uint64_t *)l;
  115|   133k|      break;
  116|   268k|    case TX_32X16:
  ------------------
  |  Branch (116:5): [True: 268k, False: 20.4M]
  ------------------
  117|   268k|      above_ec = !!*(const uint64_t *)a;
  118|   268k|      left_ec = !!*(const uint32_t *)l;
  119|   268k|      break;
  120|  2.61M|    case TX_8X8:
  ------------------
  |  Branch (120:5): [True: 2.61M, False: 18.1M]
  ------------------
  121|  2.61M|      above_ec = !!*(const uint16_t *)a;
  122|  2.61M|      left_ec = !!*(const uint16_t *)l;
  123|  2.61M|      break;
  124|  1.70M|    case TX_16X16:
  ------------------
  |  Branch (124:5): [True: 1.70M, False: 19.0M]
  ------------------
  125|  1.70M|      above_ec = !!*(const uint32_t *)a;
  126|  1.70M|      left_ec = !!*(const uint32_t *)l;
  127|  1.70M|      break;
  128|  1.43M|    case TX_32X32:
  ------------------
  |  Branch (128:5): [True: 1.43M, False: 19.3M]
  ------------------
  129|  1.43M|      above_ec = !!*(const uint64_t *)a;
  130|  1.43M|      left_ec = !!*(const uint64_t *)l;
  131|  1.43M|      break;
  132|      0|    case TX_64X64:
  ------------------
  |  Branch (132:5): [True: 0, False: 20.7M]
  ------------------
  133|      0|      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
  134|      0|      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
  135|      0|      break;
  136|      0|    case TX_32X64:
  ------------------
  |  Branch (136:5): [True: 0, False: 20.7M]
  ------------------
  137|      0|      above_ec = !!*(const uint64_t *)a;
  138|      0|      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
  139|      0|      break;
  140|      0|    case TX_64X32:
  ------------------
  |  Branch (140:5): [True: 0, False: 20.7M]
  ------------------
  141|      0|      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
  142|      0|      left_ec = !!*(const uint64_t *)l;
  143|      0|      break;
  144|   584k|    case TX_4X16:
  ------------------
  |  Branch (144:5): [True: 584k, False: 20.1M]
  ------------------
  145|   584k|      above_ec = a[0] != 0;
  146|   584k|      left_ec = !!*(const uint32_t *)l;
  147|   584k|      break;
  148|  2.03M|    case TX_16X4:
  ------------------
  |  Branch (148:5): [True: 2.03M, False: 18.7M]
  ------------------
  149|  2.03M|      above_ec = !!*(const uint32_t *)a;
  150|  2.03M|      left_ec = l[0] != 0;
  151|  2.03M|      break;
  152|   164k|    case TX_8X32:
  ------------------
  |  Branch (152:5): [True: 164k, False: 20.5M]
  ------------------
  153|   164k|      above_ec = !!*(const uint16_t *)a;
  154|   164k|      left_ec = !!*(const uint64_t *)l;
  155|   164k|      break;
  156|  1.05M|    case TX_32X8:
  ------------------
  |  Branch (156:5): [True: 1.05M, False: 19.7M]
  ------------------
  157|  1.05M|      above_ec = !!*(const uint64_t *)a;
  158|  1.05M|      left_ec = !!*(const uint16_t *)l;
  159|  1.05M|      break;
  160|      0|    case TX_16X64:
  ------------------
  |  Branch (160:5): [True: 0, False: 20.7M]
  ------------------
  161|      0|      above_ec = !!*(const uint32_t *)a;
  162|      0|      left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
  163|      0|      break;
  164|      0|    case TX_64X16:
  ------------------
  |  Branch (164:5): [True: 0, False: 20.7M]
  ------------------
  165|      0|      above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
  166|      0|      left_ec = !!*(const uint32_t *)l;
  167|      0|      break;
  168|      0|    default: assert(0 && "Invalid transform size."); break;
  ------------------
  |  Branch (168:5): [True: 0, False: 20.7M]
  ------------------
  169|  20.7M|  }
  170|  20.7M|  return combine_entropy_contexts(above_ec, left_ec);
  171|  20.7M|}
decodetxb.c:combine_entropy_contexts:
   83|  20.7M|                                           ENTROPY_CONTEXT b) {
   84|  20.7M|  return (a != 0) + (b != 0);
   85|  20.7M|}
decodetxb.c:get_txsize_entropy_ctx:
  173|  36.3M|static inline TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
  174|  36.3M|  return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >>
  175|  36.3M|                   1);
  176|  36.3M|}

av1_get_palette_color_index_context:
  895|  26.1M|                                        uint8_t *color_order, int *color_idx) {
  896|  26.1M|  assert(palette_size <= PALETTE_MAX_SIZE);
  897|  26.1M|  assert(r > 0 || c > 0);
  898|       |
  899|       |  // Get color indices of neighbors.
  900|  26.1M|  int color_neighbors[NUM_PALETTE_NEIGHBORS];
  901|  26.1M|  color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1;
  ------------------
  |  Branch (901:24): [True: 24.5M, False: 1.68M]
  ------------------
  902|  26.1M|  color_neighbors[1] =
  903|  26.1M|      (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1;
  ------------------
  |  Branch (903:8): [True: 24.5M, False: 1.68M]
  |  Branch (903:22): [True: 22.5M, False: 1.97M]
  ------------------
  904|  26.1M|  color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1;
  ------------------
  |  Branch (904:24): [True: 24.2M, False: 1.98M]
  ------------------
  905|       |
  906|       |  // The +10 below should not be needed. But we get a warning "array subscript
  907|       |  // is above array bounds [-Werror=array-bounds]" without it, possibly due to
  908|       |  // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
  909|  26.1M|  int scores[PALETTE_MAX_SIZE + 10] = { 0 };
  910|  26.1M|  int i;
  911|  26.1M|  static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 };
  912|   104M|  for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
  ------------------
  |  |   60|   104M|#define NUM_PALETTE_NEIGHBORS 3  // left, top-left and top.
  ------------------
  |  Branch (912:15): [True: 78.2M, False: 26.1M]
  ------------------
  913|  78.2M|    if (color_neighbors[i] >= 0) {
  ------------------
  |  Branch (913:9): [True: 70.9M, False: 7.28M]
  ------------------
  914|  70.9M|      scores[color_neighbors[i]] += weights[i];
  915|  70.9M|    }
  916|  78.2M|  }
  917|       |
  918|  26.1M|  int inverse_color_order[PALETTE_MAX_SIZE];
  919|   233M|  for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
  ------------------
  |  |   63|   233M|#define PALETTE_MAX_SIZE 8
  ------------------
  |  Branch (919:15): [True: 207M, False: 26.1M]
  ------------------
  920|   207M|    color_order[i] = i;
  921|   207M|    inverse_color_order[i] = i;
  922|   207M|  }
  923|       |
  924|       |  // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small).
  925|   103M|  for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
  ------------------
  |  |   60|   103M|#define NUM_PALETTE_NEIGHBORS 3  // left, top-left and top.
  ------------------
  |  Branch (925:15): [True: 77.7M, False: 26.1M]
  ------------------
  926|  77.7M|    int max = scores[i];
  927|  77.7M|    int max_idx = i;
  928|   271M|    for (int j = i + 1; j < palette_size; ++j) {
  ------------------
  |  Branch (928:25): [True: 193M, False: 77.7M]
  ------------------
  929|   193M|      if (scores[j] > max) {
  ------------------
  |  Branch (929:11): [True: 31.4M, False: 161M]
  ------------------
  930|  31.4M|        max = scores[j];
  931|  31.4M|        max_idx = j;
  932|  31.4M|      }
  933|   193M|    }
  934|  77.7M|    if (max_idx != i) {
  ------------------
  |  Branch (934:9): [True: 28.1M, False: 49.5M]
  ------------------
  935|       |      // Move the score at index 'max_idx' to index 'i', and shift the scores
  936|       |      // from 'i' to 'max_idx - 1' by 1.
  937|  28.1M|      const int max_score = scores[max_idx];
  938|  28.1M|      const uint8_t max_color_order = color_order[max_idx];
  939|  93.6M|      for (int k = max_idx; k > i; --k) {
  ------------------
  |  Branch (939:29): [True: 65.4M, False: 28.1M]
  ------------------
  940|  65.4M|        scores[k] = scores[k - 1];
  941|  65.4M|        color_order[k] = color_order[k - 1];
  942|  65.4M|        inverse_color_order[color_order[k]] = k;
  943|  65.4M|      }
  944|  28.1M|      scores[i] = max_score;
  945|  28.1M|      color_order[i] = max_color_order;
  946|  28.1M|      inverse_color_order[color_order[i]] = i;
  947|  28.1M|    }
  948|  77.7M|  }
  949|       |
  950|  26.1M|  if (color_idx != NULL)
  ------------------
  |  Branch (950:7): [True: 0, False: 26.1M]
  ------------------
  951|      0|    *color_idx = inverse_color_order[color_map[r * stride + c]];
  952|       |
  953|       |  // Get hash value of context.
  954|  26.1M|  int color_index_ctx_hash = 0;
  955|  26.1M|  static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
  956|   104M|  for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
  ------------------
  |  |   60|   104M|#define NUM_PALETTE_NEIGHBORS 3  // left, top-left and top.
  ------------------
  |  Branch (956:15): [True: 78.4M, False: 26.1M]
  ------------------
  957|  78.4M|    color_index_ctx_hash += scores[i] * hash_multipliers[i];
  958|  78.4M|  }
  959|  26.1M|  assert(color_index_ctx_hash > 0);
  960|  26.1M|  assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
  961|       |
  962|       |  // Lookup context from hash.
  963|  26.1M|  const int color_index_ctx =
  964|  26.1M|      av1_palette_color_index_context_lookup[color_index_ctx_hash];
  965|  26.1M|  assert(color_index_ctx >= 0);
  966|  26.1M|  assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
  967|  26.1M|  return color_index_ctx;
  968|  26.1M|}
av1_init_mode_probs:
  970|   150k|void av1_init_mode_probs(FRAME_CONTEXT *fc) {
  971|   150k|  av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  972|   150k|  av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  973|   150k|  av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  974|   150k|  av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  975|   150k|  av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  976|   150k|  av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  977|   150k|  av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  978|   150k|  av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  979|   150k|  av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  980|   150k|  av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  981|   150k|  av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  982|   150k|  av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  983|   150k|  av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  984|   150k|  av1_copy(fc->single_ref_cdf, default_single_ref_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  985|   150k|  av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  986|   150k|  av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  987|   150k|  av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  988|   150k|  av1_copy(fc->newmv_cdf, default_newmv_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  989|   150k|  av1_copy(fc->zeromv_cdf, default_zeromv_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  990|   150k|  av1_copy(fc->refmv_cdf, default_refmv_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  991|   150k|  av1_copy(fc->drl_cdf, default_drl_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  992|   150k|  av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  993|   150k|  av1_copy(fc->obmc_cdf, default_obmc_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  994|   150k|  av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  995|   150k|  av1_copy(fc->compound_type_cdf, default_compound_type_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  996|   150k|  av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  997|   150k|  av1_copy(fc->interintra_cdf, default_interintra_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  998|   150k|  av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  999|   150k|  av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1000|   150k|  av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1001|   150k|  av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1002|   150k|  av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1003|   150k|  av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1004|   150k|  av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1005|   150k|  av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1006|   150k|  av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1007|   150k|  av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1008|   150k|  av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1009|   150k|  av1_copy(fc->partition_cdf, default_partition_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1010|   150k|  av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1011|   150k|  av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1012|   150k|  av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1013|   150k|  av1_copy(fc->skip_txfm_cdfs, default_skip_txfm_cdfs);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1014|   150k|  av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1015|   600k|  for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++)
  ------------------
  |  |   25|   600k|#define SPATIAL_PREDICTION_PROBS 3
  ------------------
  |  Branch (1015:19): [True: 450k, False: 150k]
  ------------------
 1016|   450k|    av1_copy(fc->seg.spatial_pred_seg_cdf[i],
  ------------------
  |  |   31|   600k|  do {                                   \
  |  |   32|   450k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   450k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   450k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1017|   150k|             default_spatial_pred_seg_tree_cdf[i]);
 1018|   150k|  av1_copy(fc->tx_size_cdf, default_tx_size_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1019|   150k|  av1_copy(fc->delta_q_cdf, default_delta_q_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1020|   150k|  av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1021|   150k|  av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1022|   150k|  av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1023|   150k|  av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1024|   150k|  av1_copy(fc->intrabc_cdf, default_intrabc_cdf);
  ------------------
  |  |   31|   150k|  do {                                   \
  |  |   32|   150k|    assert(sizeof(dest) == sizeof(src)); \
  |  |   33|   150k|    memcpy(dest, src, sizeof(src));      \
  |  |   34|   150k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1025|   150k|}
av1_set_default_ref_deltas:
 1027|   450k|void av1_set_default_ref_deltas(int8_t *ref_deltas) {
 1028|   450k|  assert(ref_deltas != NULL);
 1029|       |
 1030|   450k|  ref_deltas[INTRA_FRAME] = 1;
 1031|   450k|  ref_deltas[LAST_FRAME] = 0;
 1032|   450k|  ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME];
 1033|   450k|  ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME];
 1034|   450k|  ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME];
 1035|   450k|  ref_deltas[GOLDEN_FRAME] = -1;
 1036|   450k|  ref_deltas[ALTREF2_FRAME] = -1;
 1037|   450k|  ref_deltas[ALTREF_FRAME] = -1;
 1038|   450k|}
av1_set_default_mode_deltas:
 1040|   450k|void av1_set_default_mode_deltas(int8_t *mode_deltas) {
 1041|   450k|  assert(mode_deltas != NULL);
 1042|       |
 1043|   450k|  mode_deltas[0] = 0;
 1044|   450k|  mode_deltas[1] = 0;
 1045|   450k|}
av1_setup_frame_contexts:
 1055|   150k|void av1_setup_frame_contexts(AV1_COMMON *cm) {
 1056|       |  // Store the frame context into a special slot (not associated with any
 1057|       |  // reference buffer), so that we can set up cm->pre_fc correctly later
 1058|       |  // This function must ONLY be called when cm->fc has been initialized with
 1059|       |  // default probs, either by av1_setup_past_independence or after manually
 1060|       |  // initializing them
 1061|   150k|  *cm->default_frame_context = *cm->fc;
 1062|       |  // TODO(jack.haughton@argondesign.com): don't think this should be necessary,
 1063|       |  // but could do with fuller testing
 1064|   150k|  if (cm->tiles.large_scale) {
  ------------------
  |  Branch (1064:7): [True: 34.5k, False: 115k]
  ------------------
 1065|   276k|    for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (1065:30): [True: 241k, False: 34.5k]
  ------------------
 1066|   241k|      RefCntBuffer *const buf = get_ref_frame_buf(cm, i);
 1067|   241k|      if (buf != NULL) buf->frame_context = *cm->fc;
  ------------------
  |  Branch (1067:11): [True: 73.2k, False: 168k]
  ------------------
 1068|   241k|    }
 1069|   586k|    for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i)
  ------------------
  |  Branch (1069:21): [True: 552k, False: 34.5k]
  ------------------
 1070|   552k|      cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc;
 1071|  34.5k|  }
 1072|   150k|}
av1_setup_past_independence:
 1074|   150k|void av1_setup_past_independence(AV1_COMMON *cm) {
 1075|       |  // Reset the segment feature data to the default stats:
 1076|       |  // Features disabled, 0, with delta coding (Default state).
 1077|   150k|  av1_clearall_segfeatures(&cm->seg);
 1078|       |
 1079|   150k|  if (cm->cur_frame->seg_map) {
  ------------------
  |  Branch (1079:7): [True: 150k, False: 0]
  ------------------
 1080|   150k|    memset(cm->cur_frame->seg_map, 0,
 1081|   150k|           (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
 1082|   150k|  }
 1083|       |
 1084|       |  // reset mode ref deltas
 1085|   150k|  av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
 1086|   150k|  av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
 1087|   150k|  set_default_lf_deltas(&cm->lf);
 1088|       |
 1089|   150k|  av1_default_coef_probs(cm);
 1090|   150k|  av1_init_mode_probs(cm->fc);
 1091|   150k|  av1_init_mv_probs(cm);
 1092|   150k|  cm->fc->initialized = 1;
 1093|   150k|  av1_setup_frame_contexts(cm);
 1094|   150k|}
entropymode.c:set_default_lf_deltas:
 1047|   150k|static void set_default_lf_deltas(struct loopfilter *lf) {
 1048|   150k|  lf->mode_ref_delta_enabled = 1;
 1049|   150k|  lf->mode_ref_delta_update = 1;
 1050|       |
 1051|   150k|  av1_set_default_ref_deltas(lf->ref_deltas);
 1052|   150k|  av1_set_default_mode_deltas(lf->mode_deltas);
 1053|   150k|}

av1_init_mv_probs:
   63|   150k|void av1_init_mv_probs(AV1_COMMON *cm) {
   64|       |  // NB: this sets CDFs too
   65|   150k|  cm->fc->nmvc = default_nmv_context;
   66|   150k|  cm->fc->ndvc = default_nmv_context;
   67|   150k|}

decodemv.c:mv_joint_vertical:
   40|  2.25M|static inline int mv_joint_vertical(MV_JOINT_TYPE type) {
   41|  2.25M|  return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
  ------------------
  |  Branch (41:10): [True: 509k, False: 1.74M]
  |  Branch (41:36): [True: 754k, False: 990k]
  ------------------
   42|  2.25M|}
decodemv.c:mv_joint_horizontal:
   44|  2.25M|static inline int mv_joint_horizontal(MV_JOINT_TYPE type) {
   45|  2.25M|  return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
  ------------------
  |  Branch (45:10): [True: 370k, False: 1.88M]
  |  Branch (45:36): [True: 754k, False: 1.12M]
  ------------------
   46|  2.25M|}

decodeframe.c:av1_get_interp_filter_params_with_block_size:
  249|  33.7M|                                             const int w) {
  250|  33.7M|  if (w <= 4 && interp_filter != MULTITAP_SHARP2)
  ------------------
  |  Branch (250:7): [True: 13.8M, False: 19.9M]
  |  Branch (250:17): [True: 13.8M, False: 18.4E]
  ------------------
  251|  13.8M|    return &av1_interp_4tap[interp_filter];
  252|  19.9M|  return &av1_interp_filter_params_list[interp_filter];
  253|  33.7M|}
decodemv.c:av1_broadcast_interp_filter:
   86|  1.61M|    InterpFilter filter) {
   87|  1.61M|  int_interpfilters filters;
   88|  1.61M|  filters.as_filters.x_filter = filter;
   89|  1.61M|  filters.as_filters.y_filter = filter;
   90|  1.61M|  return filters;
   91|  1.61M|}
decodemv.c:av1_unswitchable_filter:
   93|   897k|static inline InterpFilter av1_unswitchable_filter(InterpFilter filter) {
   94|   897k|  return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter;
  ------------------
  |  Branch (94:10): [True: 659k, False: 237k]
  ------------------
   95|   897k|}
highbd_convolve_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  1.91M|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  1.91M|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  1.91M|}
pred_common.c:av1_extract_interp_filter:
   80|  4.53M|                                                     int dir) {
   81|  4.53M|  return (InterpFilter)((dir) ? filters.as_filters.x_filter
  ------------------
  |  Branch (81:25): [True: 132k, False: 4.40M]
  ------------------
   82|  4.53M|                              : filters.as_filters.y_filter);
   83|  4.53M|}
av1_convolve_scale_sse4.c:av1_get_interp_filter_subpel_kernel:
  266|  45.6M|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  45.6M|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  45.6M|}
convolve_2d_avx2.c:get_filter_tap:
  298|  7.28M|                                 int subpel_qn) {
  299|  7.28M|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  300|  7.28M|      filter_params, subpel_qn & SUBPEL_MASK);
  ------------------
  |  |   24|  7.28M|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  7.28M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  301|  7.28M|  if (filter_params->taps == 12) {
  ------------------
  |  Branch (301:7): [True: 0, False: 7.28M]
  ------------------
  302|      0|    return 12;
  303|      0|  }
  304|  7.28M|  if (filter[0] | filter[7]) {
  ------------------
  |  Branch (304:7): [True: 212k, False: 7.06M]
  ------------------
  305|   212k|    return 8;
  306|   212k|  }
  307|  7.06M|  if (filter[1] | filter[6]) {
  ------------------
  |  Branch (307:7): [True: 3.41M, False: 3.65M]
  ------------------
  308|  3.41M|    return 6;
  309|  3.41M|  }
  310|  3.65M|#if CONFIG_SVT_AV1
  311|  3.65M|  if (filter[2] | filter[5]) {
  ------------------
  |  Branch (311:7): [True: 3.37M, False: 280k]
  ------------------
  312|  3.37M|    return 4;
  313|  3.37M|  }
  314|   280k|  return 2;
  315|       |#else
  316|       |  return 4;
  317|       |#endif
  318|  3.65M|}
convolve_2d_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  10.9M|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  10.9M|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  10.9M|}
convolve_avx2.c:get_filter_tap:
  298|  3.15M|                                 int subpel_qn) {
  299|  3.15M|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  300|  3.15M|      filter_params, subpel_qn & SUBPEL_MASK);
  ------------------
  |  |   24|  3.15M|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  3.15M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  301|  3.15M|  if (filter_params->taps == 12) {
  ------------------
  |  Branch (301:7): [True: 0, False: 3.15M]
  ------------------
  302|      0|    return 12;
  303|      0|  }
  304|  3.15M|  if (filter[0] | filter[7]) {
  ------------------
  |  Branch (304:7): [True: 78.4k, False: 3.07M]
  ------------------
  305|  78.4k|    return 8;
  306|  78.4k|  }
  307|  3.07M|  if (filter[1] | filter[6]) {
  ------------------
  |  Branch (307:7): [True: 1.54M, False: 1.52M]
  ------------------
  308|  1.54M|    return 6;
  309|  1.54M|  }
  310|  1.52M|#if CONFIG_SVT_AV1
  311|  1.52M|  if (filter[2] | filter[5]) {
  ------------------
  |  Branch (311:7): [True: 1.36M, False: 163k]
  ------------------
  312|  1.36M|    return 4;
  313|  1.36M|  }
  314|   163k|  return 2;
  315|       |#else
  316|       |  return 4;
  317|       |#endif
  318|  1.52M|}
convolve_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  4.72M|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  4.72M|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  4.72M|}
jnt_convolve_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|   708k|    const InterpFilterParams *const filter_params, const int subpel) {
  267|   708k|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|   708k|}
highbd_convolve_2d_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  5.98M|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  5.98M|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  5.98M|}
highbd_jnt_convolve_avx2.c:av1_get_interp_filter_subpel_kernel:
  266|  1.96M|    const InterpFilterParams *const filter_params, const int subpel) {
  267|  1.96M|  return filter_params->filter_ptr + filter_params->taps * subpel;
  268|  1.96M|}

av1_alloc_internal_frame_buffers:
   17|  16.1k|int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
   18|  16.1k|  assert(list != NULL);
   19|  16.1k|  av1_free_internal_frame_buffers(list);
   20|       |
   21|  16.1k|  list->num_internal_frame_buffers =
   22|  16.1k|      AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
  ------------------
  |  |   34|  16.1k|#define AOM_MAXIMUM_REF_BUFFERS 8
  ------------------
                    AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
  ------------------
  |  |   30|  16.1k|#define AOM_MAXIMUM_WORK_BUFFERS 8
  ------------------
   23|  16.1k|  list->int_fb = (InternalFrameBuffer *)aom_calloc(
   24|  16.1k|      list->num_internal_frame_buffers, sizeof(*list->int_fb));
   25|  16.1k|  if (list->int_fb == NULL) {
  ------------------
  |  Branch (25:7): [True: 0, False: 16.1k]
  ------------------
   26|      0|    list->num_internal_frame_buffers = 0;
   27|      0|    return 1;
   28|      0|  }
   29|  16.1k|  return 0;
   30|  16.1k|}
av1_free_internal_frame_buffers:
   32|  32.2k|void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
   33|  32.2k|  int i;
   34|       |
   35|  32.2k|  assert(list != NULL);
   36|       |
   37|   289k|  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
  ------------------
  |  Branch (37:15): [True: 257k, False: 32.2k]
  ------------------
   38|   257k|    aom_free(list->int_fb[i].data);
   39|   257k|    list->int_fb[i].data = NULL;
   40|   257k|  }
   41|  32.2k|  aom_free(list->int_fb);
   42|  32.2k|  list->int_fb = NULL;
   43|  32.2k|  list->num_internal_frame_buffers = 0;
   44|  32.2k|}
av1_zero_unused_internal_frame_buffers:
   46|  27.5k|void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
   47|  27.5k|  int i;
   48|       |
   49|  27.5k|  assert(list != NULL);
   50|       |
   51|   468k|  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
  ------------------
  |  Branch (51:15): [True: 441k, False: 27.5k]
  ------------------
   52|   441k|    if (list->int_fb[i].data && !list->int_fb[i].in_use)
  ------------------
  |  Branch (52:9): [True: 85.2k, False: 355k]
  |  Branch (52:33): [True: 80.2k, False: 5.04k]
  ------------------
   53|  80.2k|      memset(list->int_fb[i].data, 0, list->int_fb[i].size);
   54|   441k|  }
   55|  27.5k|}
av1_get_frame_buffer:
   58|   498k|                         aom_codec_frame_buffer_t *fb) {
   59|   498k|  int i;
   60|   498k|  InternalFrameBufferList *const int_fb_list =
   61|   498k|      (InternalFrameBufferList *)cb_priv;
   62|   498k|  if (int_fb_list == NULL) return -1;
  ------------------
  |  Branch (62:7): [True: 0, False: 498k]
  ------------------
   63|       |
   64|       |  // Find a free frame buffer.
   65|  1.91M|  for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
  ------------------
  |  Branch (65:15): [True: 1.91M, False: 0]
  ------------------
   66|  1.91M|    if (!int_fb_list->int_fb[i].in_use) break;
  ------------------
  |  Branch (66:9): [True: 498k, False: 1.42M]
  ------------------
   67|  1.91M|  }
   68|       |
   69|   498k|  if (i == int_fb_list->num_internal_frame_buffers) return -1;
  ------------------
  |  Branch (69:7): [True: 0, False: 498k]
  ------------------
   70|       |
   71|   498k|  if (int_fb_list->int_fb[i].size < min_size) {
  ------------------
  |  Branch (71:7): [True: 51.2k, False: 447k]
  ------------------
   72|  51.2k|    aom_free(int_fb_list->int_fb[i].data);
   73|       |    // The data must be zeroed to fix a valgrind error from the C loop filter
   74|       |    // due to access uninitialized memory in frame border. It could be
   75|       |    // skipped if border were totally removed.
   76|  51.2k|    int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size);
   77|  51.2k|    if (!int_fb_list->int_fb[i].data) {
  ------------------
  |  Branch (77:9): [True: 0, False: 51.2k]
  ------------------
   78|      0|      int_fb_list->int_fb[i].size = 0;
   79|      0|      return -1;
   80|      0|    }
   81|  51.2k|    int_fb_list->int_fb[i].size = min_size;
   82|  51.2k|  }
   83|       |
   84|   498k|  fb->data = int_fb_list->int_fb[i].data;
   85|   498k|  fb->size = int_fb_list->int_fb[i].size;
   86|   498k|  int_fb_list->int_fb[i].in_use = 1;
   87|       |
   88|       |  // Set the frame buffer's private data to point at the internal frame buffer.
   89|   498k|  fb->priv = &int_fb_list->int_fb[i];
   90|   498k|  return 0;
   91|   498k|}
av1_release_frame_buffer:
   93|   498k|int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
   94|   498k|  InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
   95|   498k|  (void)cb_priv;
   96|   498k|  if (int_fb) int_fb->in_use = 0;
  ------------------
  |  Branch (96:7): [True: 498k, False: 0]
  ------------------
   97|   498k|  return 0;
   98|   498k|}

av1_get_tx_scale:
   24|  36.0M|int av1_get_tx_scale(const TX_SIZE tx_size) {
   25|  36.0M|  const int pels = tx_size_2d[tx_size];
   26|       |  // Largest possible pels is 4096 (64x64).
   27|  36.0M|  return (pels > 256) + (pels > 1024);
   28|  36.0M|}
av1_highbd_iwht4x4_add:
   35|  1.04M|                            int eob, int bd) {
   36|  1.04M|  if (eob > 1)
  ------------------
  |  Branch (36:7): [True: 413k, False: 630k]
  ------------------
   37|   413k|    av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
   38|   630k|  else
   39|   630k|    av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
  ------------------
  |  |  281|   630k|#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
  ------------------
   40|  1.04M|}
av1_inv_txfm_add_c:
  297|   913k|                        const TxfmParam *txfm_param) {
  298|   913k|  const TX_SIZE tx_size = txfm_param->tx_size;
  299|   913k|  DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
  ------------------
  |  |   19|   913k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  300|   913k|  int tmp_stride = MAX_TX_SIZE;
  ------------------
  |  |  183|   913k|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |  182|   913k|#define MAX_TX_SIZE_LOG2 (6)
  |  |  ------------------
  ------------------
  301|   913k|  int w = tx_size_wide[tx_size];
  302|   913k|  int h = tx_size_high[tx_size];
  303|  4.56M|  for (int r = 0; r < h; ++r) {
  ------------------
  |  Branch (303:19): [True: 3.65M, False: 913k]
  ------------------
  304|  18.2M|    for (int c = 0; c < w; ++c) {
  ------------------
  |  Branch (304:21): [True: 14.6M, False: 3.65M]
  ------------------
  305|  14.6M|      tmp[r * tmp_stride + c] = dst[r * stride + c];
  306|  14.6M|    }
  307|  3.65M|  }
  308|       |
  309|   913k|  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
  ------------------
  |  |   76|   913k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  310|   913k|                          txfm_param);
  311|       |
  312|  4.56M|  for (int r = 0; r < h; ++r) {
  ------------------
  |  Branch (312:19): [True: 3.65M, False: 913k]
  ------------------
  313|  18.2M|    for (int c = 0; c < w; ++c) {
  ------------------
  |  Branch (313:21): [True: 14.6M, False: 3.65M]
  ------------------
  314|  14.6M|      dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c];
  315|  14.6M|    }
  316|  3.65M|  }
  317|   913k|}
av1_inverse_transform_block:
  322|  19.3M|                                 int stride, int eob, int reduced_tx_set) {
  323|  19.3M|  if (!eob) return;
  ------------------
  |  Branch (323:7): [True: 3.20M, False: 16.1M]
  ------------------
  324|       |
  325|  16.1M|  assert(eob <= av1_get_max_eob(tx_size));
  326|       |
  327|  16.1M|  TxfmParam txfm_param;
  328|  16.1M|  init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set,
  329|  16.1M|                  &txfm_param);
  330|  16.1M|  assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
  331|       |
  332|  16.1M|  if (txfm_param.is_hbd) {
  ------------------
  |  Branch (332:7): [True: 9.18M, False: 6.94M]
  ------------------
  333|  9.18M|    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
  334|  9.18M|  } else {
  335|  6.94M|    av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
  336|  6.94M|  }
  337|  16.1M|}
idct.c:init_txfm_param:
  215|  16.1M|                            TxfmParam *txfm_param) {
  216|  16.1M|  (void)plane;
  217|  16.1M|  txfm_param->tx_type = tx_type;
  218|  16.1M|  txfm_param->tx_size = tx_size;
  219|  16.1M|  txfm_param->eob = eob;
  220|  16.1M|  txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
  221|  16.1M|  txfm_param->bd = xd->bd;
  222|  16.1M|  txfm_param->is_hbd = is_cur_buf_hbd(xd);
  223|  16.1M|  txfm_param->tx_set_type = av1_get_ext_tx_set_type(
  224|  16.1M|      txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
  225|  16.1M|}

highbd_inv_txfm_sse4.c:cast_to_int32:
   42|  2.37M|static inline const int32_t *cast_to_int32(const tran_low_t *input) {
   43|  2.37M|  assert(sizeof(int32_t) == sizeof(tran_low_t));
   44|  2.37M|  return (const int32_t *)input;
   45|  2.37M|}

decodeframe.c:clamp_mv:
  323|  17.0M|static inline void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) {
  324|  17.0M|  mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
  325|  17.0M|  mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
  326|  17.0M|}
decodemv.c:convert_fullmv_to_mv:
   91|  27.8k|static inline void convert_fullmv_to_mv(int_mv *mv) {
   92|  27.8k|  mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv);
   93|  27.8k|}
decodemv.c:get_mv_from_fullmv:
   85|  27.8k|static inline MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) {
   86|  27.8k|  const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row),
  ------------------
  |  |   29|  27.8k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
   87|  27.8k|                         (int16_t)GET_MV_SUBPEL(full_mv->col) };
  ------------------
  |  |   29|  27.8k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
   88|  27.8k|  return subpel_mv;
   89|  27.8k|}
decodemv.c:integer_mv_precision:
  199|   255k|static inline void integer_mv_precision(MV *mv) {
  200|   255k|  int mod = (mv->row % 8);
  201|   255k|  if (mod != 0) {
  ------------------
  |  Branch (201:7): [True: 20.1k, False: 235k]
  ------------------
  202|  20.1k|    mv->row -= mod;
  203|  20.1k|    if (abs(mod) > 4) {
  ------------------
  |  Branch (203:9): [True: 7.42k, False: 12.6k]
  ------------------
  204|  7.42k|      if (mod > 0) {
  ------------------
  |  Branch (204:11): [True: 3.20k, False: 4.22k]
  ------------------
  205|  3.20k|        mv->row += 8;
  206|  4.22k|      } else {
  207|  4.22k|        mv->row -= 8;
  208|  4.22k|      }
  209|  7.42k|    }
  210|  20.1k|  }
  211|       |
  212|   255k|  mod = (mv->col % 8);
  213|   255k|  if (mod != 0) {
  ------------------
  |  Branch (213:7): [True: 19.7k, False: 235k]
  ------------------
  214|  19.7k|    mv->col -= mod;
  215|  19.7k|    if (abs(mod) > 4) {
  ------------------
  |  Branch (215:9): [True: 5.63k, False: 14.1k]
  ------------------
  216|  5.63k|      if (mod > 0) {
  ------------------
  |  Branch (216:11): [True: 3.00k, False: 2.63k]
  ------------------
  217|  3.00k|        mv->col += 8;
  218|  3.00k|      } else {
  219|  2.63k|        mv->col -= 8;
  220|  2.63k|      }
  221|  5.63k|    }
  222|  19.7k|  }
  223|   255k|}
decodemv.c:gm_get_motion_vector:
  234|   895k|                                          int is_integer) {
  235|   895k|  int_mv res;
  236|       |
  237|   895k|  if (gm->wmtype == IDENTITY) {
  ------------------
  |  Branch (237:7): [True: 616k, False: 278k]
  ------------------
  238|   616k|    res.as_int = 0;
  239|   616k|    return res;
  240|   616k|  }
  241|       |
  242|   278k|  const int32_t *mat = gm->wmmat;
  243|   278k|  int x, y, tx, ty;
  244|       |
  245|   278k|  if (gm->wmtype == TRANSLATION) {
  ------------------
  |  Branch (245:7): [True: 57.3k, False: 221k]
  ------------------
  246|       |    // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
  247|       |    // bits of fractional precision. The offset for a translation is stored in
  248|       |    // entries 0 and 1. For translations, all but the top three (two if
  249|       |    // cm->features.allow_high_precision_mv is false) fractional bits are always
  250|       |    // zero.
  251|       |    //
  252|       |    // After the right shifts, there are 3 fractional bits of precision. If
  253|       |    // allow_hp is false, the bottom bit is always zero (so we don't need a
  254|       |    // call to convert_to_trans_prec here)
  255|       |    //
  256|       |    // Note: There is an AV1 specification bug here:
  257|       |    //
  258|       |    // gm->wmmat[0] is supposed to be the horizontal translation, and so should
  259|       |    // go into res.as_mv.col, and gm->wmmat[1] is supposed to be the vertical
  260|       |    // translation and so should go into res.as_mv.row
  261|       |    //
  262|       |    // However, in the spec, these assignments are accidentally reversed, and so
  263|       |    // we must keep this incorrect logic to match the spec.
  264|       |    //
  265|       |    // See also: https://crbug.com/aomedia/3328
  266|  57.3k|    res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
  ------------------
  |  |  168|  57.3k|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|  57.3k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  267|  57.3k|    res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
  ------------------
  |  |  168|  57.3k|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|  57.3k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  268|  57.3k|    assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
  269|  57.3k|    if (is_integer) {
  ------------------
  |  Branch (269:9): [True: 8.31k, False: 49.0k]
  ------------------
  270|  8.31k|      integer_mv_precision(&res.as_mv);
  271|  8.31k|    }
  272|  57.3k|    return res;
  273|  57.3k|  }
  274|       |
  275|   221k|  x = block_center_x(mi_col, bsize);
  276|   221k|  y = block_center_y(mi_row, bsize);
  277|       |
  278|   221k|  if (gm->wmtype == ROTZOOM) {
  ------------------
  |  Branch (278:7): [True: 145k, False: 75.7k]
  ------------------
  279|   145k|    assert(gm->wmmat[5] == gm->wmmat[2]);
  280|   145k|    assert(gm->wmmat[4] == -gm->wmmat[3]);
  281|   145k|  }
  282|       |
  283|   221k|  const int xc =
  284|   221k|      (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
  ------------------
  |  |   96|   221k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  285|   221k|  const int yc =
  286|   221k|      mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
  ------------------
  |  |   96|   221k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  287|   221k|  tx = convert_to_trans_prec(allow_hp, xc);
  288|   221k|  ty = convert_to_trans_prec(allow_hp, yc);
  289|       |
  290|   221k|  res.as_mv.row = ty;
  291|   221k|  res.as_mv.col = tx;
  292|       |
  293|   221k|  if (is_integer) {
  ------------------
  |  Branch (293:7): [True: 21.1k, False: 200k]
  ------------------
  294|  21.1k|    integer_mv_precision(&res.as_mv);
  295|  21.1k|  }
  296|   221k|  return res;
  297|   221k|}
decodemv.c:block_center_x:
  183|   221k|static inline int block_center_x(int mi_col, BLOCK_SIZE bs) {
  184|   221k|  const int bw = block_size_wide[bs];
  185|   221k|  return mi_col * MI_SIZE + bw / 2 - 1;
  ------------------
  |  |   40|   221k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   221k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  186|   221k|}
decodemv.c:block_center_y:
  188|   221k|static inline int block_center_y(int mi_row, BLOCK_SIZE bs) {
  189|   221k|  const int bh = block_size_high[bs];
  190|   221k|  return mi_row * MI_SIZE + bh / 2 - 1;
  ------------------
  |  |   40|   221k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   221k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  191|   221k|}
decodemv.c:convert_to_trans_prec:
  193|   442k|static inline int convert_to_trans_prec(int allow_hp, int coor) {
  194|   442k|  if (allow_hp)
  ------------------
  |  Branch (194:7): [True: 132k, False: 310k]
  ------------------
  195|   132k|    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3);
  ------------------
  |  |   45|   132k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  50.8k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 50.8k, False: 81.3k]
  |  |  ------------------
  |  |   46|   132k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  81.3k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  196|   310k|  else
  197|   310k|    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
  ------------------
  |  |   45|   310k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|   176k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 176k, False: 134k]
  |  |  ------------------
  |  |   46|   310k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|   134k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  198|   442k|}
mvref_common.c:gm_get_motion_vector:
  234|  5.18M|                                          int is_integer) {
  235|  5.18M|  int_mv res;
  236|       |
  237|  5.18M|  if (gm->wmtype == IDENTITY) {
  ------------------
  |  Branch (237:7): [True: 4.24M, False: 944k]
  ------------------
  238|  4.24M|    res.as_int = 0;
  239|  4.24M|    return res;
  240|  4.24M|  }
  241|       |
  242|   944k|  const int32_t *mat = gm->wmmat;
  243|   944k|  int x, y, tx, ty;
  244|       |
  245|   944k|  if (gm->wmtype == TRANSLATION) {
  ------------------
  |  Branch (245:7): [True: 139k, False: 805k]
  ------------------
  246|       |    // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
  247|       |    // bits of fractional precision. The offset for a translation is stored in
  248|       |    // entries 0 and 1. For translations, all but the top three (two if
  249|       |    // cm->features.allow_high_precision_mv is false) fractional bits are always
  250|       |    // zero.
  251|       |    //
  252|       |    // After the right shifts, there are 3 fractional bits of precision. If
  253|       |    // allow_hp is false, the bottom bit is always zero (so we don't need a
  254|       |    // call to convert_to_trans_prec here)
  255|       |    //
  256|       |    // Note: There is an AV1 specification bug here:
  257|       |    //
  258|       |    // gm->wmmat[0] is supposed to be the horizontal translation, and so should
  259|       |    // go into res.as_mv.col, and gm->wmmat[1] is supposed to be the vertical
  260|       |    // translation and so should go into res.as_mv.row
  261|       |    //
  262|       |    // However, in the spec, these assignments are accidentally reversed, and so
  263|       |    // we must keep this incorrect logic to match the spec.
  264|       |    //
  265|       |    // See also: https://crbug.com/aomedia/3328
  266|   139k|    res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
  ------------------
  |  |  168|   139k|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|   139k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  267|   139k|    res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
  ------------------
  |  |  168|   139k|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|   139k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  268|   139k|    assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
  269|   139k|    if (is_integer) {
  ------------------
  |  Branch (269:9): [True: 25.3k, False: 113k]
  ------------------
  270|  25.3k|      integer_mv_precision(&res.as_mv);
  271|  25.3k|    }
  272|   139k|    return res;
  273|   139k|  }
  274|       |
  275|   805k|  x = block_center_x(mi_col, bsize);
  276|   805k|  y = block_center_y(mi_row, bsize);
  277|       |
  278|   805k|  if (gm->wmtype == ROTZOOM) {
  ------------------
  |  Branch (278:7): [True: 355k, False: 449k]
  ------------------
  279|   355k|    assert(gm->wmmat[5] == gm->wmmat[2]);
  280|   355k|    assert(gm->wmmat[4] == -gm->wmmat[3]);
  281|   355k|  }
  282|       |
  283|   805k|  const int xc =
  284|   805k|      (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
  ------------------
  |  |   96|   805k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  285|   805k|  const int yc =
  286|   805k|      mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
  ------------------
  |  |   96|   805k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  287|   805k|  tx = convert_to_trans_prec(allow_hp, xc);
  288|   805k|  ty = convert_to_trans_prec(allow_hp, yc);
  289|       |
  290|   805k|  res.as_mv.row = ty;
  291|   805k|  res.as_mv.col = tx;
  292|       |
  293|   805k|  if (is_integer) {
  ------------------
  |  Branch (293:7): [True: 134k, False: 671k]
  ------------------
  294|   134k|    integer_mv_precision(&res.as_mv);
  295|   134k|  }
  296|   805k|  return res;
  297|   805k|}
mvref_common.c:integer_mv_precision:
  199|   402k|static inline void integer_mv_precision(MV *mv) {
  200|   402k|  int mod = (mv->row % 8);
  201|   402k|  if (mod != 0) {
  ------------------
  |  Branch (201:7): [True: 111k, False: 291k]
  ------------------
  202|   111k|    mv->row -= mod;
  203|   111k|    if (abs(mod) > 4) {
  ------------------
  |  Branch (203:9): [True: 37.3k, False: 73.9k]
  ------------------
  204|  37.3k|      if (mod > 0) {
  ------------------
  |  Branch (204:11): [True: 18.3k, False: 19.0k]
  ------------------
  205|  18.3k|        mv->row += 8;
  206|  19.0k|      } else {
  207|  19.0k|        mv->row -= 8;
  208|  19.0k|      }
  209|  37.3k|    }
  210|   111k|  }
  211|       |
  212|   402k|  mod = (mv->col % 8);
  213|   402k|  if (mod != 0) {
  ------------------
  |  Branch (213:7): [True: 107k, False: 295k]
  ------------------
  214|   107k|    mv->col -= mod;
  215|   107k|    if (abs(mod) > 4) {
  ------------------
  |  Branch (215:9): [True: 29.9k, False: 77.2k]
  ------------------
  216|  29.9k|      if (mod > 0) {
  ------------------
  |  Branch (216:11): [True: 13.7k, False: 16.2k]
  ------------------
  217|  13.7k|        mv->col += 8;
  218|  16.2k|      } else {
  219|  16.2k|        mv->col -= 8;
  220|  16.2k|      }
  221|  29.9k|    }
  222|   107k|  }
  223|   402k|}
mvref_common.c:block_center_x:
  183|   805k|static inline int block_center_x(int mi_col, BLOCK_SIZE bs) {
  184|   805k|  const int bw = block_size_wide[bs];
  185|   805k|  return mi_col * MI_SIZE + bw / 2 - 1;
  ------------------
  |  |   40|   805k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   805k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  186|   805k|}
mvref_common.c:block_center_y:
  188|   805k|static inline int block_center_y(int mi_row, BLOCK_SIZE bs) {
  189|   805k|  const int bh = block_size_high[bs];
  190|   805k|  return mi_row * MI_SIZE + bh / 2 - 1;
  ------------------
  |  |   40|   805k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   805k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  191|   805k|}
mvref_common.c:convert_to_trans_prec:
  193|  1.61M|static inline int convert_to_trans_prec(int allow_hp, int coor) {
  194|  1.61M|  if (allow_hp)
  ------------------
  |  Branch (194:7): [True: 300k, False: 1.31M]
  ------------------
  195|   300k|    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3);
  ------------------
  |  |   45|   300k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|   126k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 126k, False: 173k]
  |  |  ------------------
  |  |   46|   300k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|   173k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  196|  1.31M|  else
  197|  1.31M|    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
  ------------------
  |  |   45|  1.31M|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|   971k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 971k, False: 339k]
  |  |  ------------------
  |  |   46|  1.31M|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|   339k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  198|  1.61M|}
mvref_common.c:clamp_mv:
  323|  13.3M|static inline void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) {
  324|  13.3M|  mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
  325|  13.3M|  mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
  326|  13.3M|}

av1_get_mv_projection:
   27|  18.4M|void av1_get_mv_projection(MV *output, MV ref, int num, int den) {
   28|  18.4M|  den = AOMMIN(den, MAX_FRAME_DISTANCE);
  ------------------
  |  |   34|  18.4M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 18.3M, False: 20.9k]
  |  |  ------------------
  ------------------
   29|  18.4M|  num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
  ------------------
  |  |   34|  15.3M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 15.2M, False: 14.2k]
  |  |  ------------------
  ------------------
  |  Branch (29:9): [True: 15.3M, False: 3.11M]
  ------------------
   30|  18.4M|                : AOMMAX(num, -MAX_FRAME_DISTANCE);
  ------------------
  |  |   35|  18.4E|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 3.14M, False: 18.4E]
  |  |  ------------------
  ------------------
   31|  18.4M|  const int mv_row =
   32|  18.4M|      ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
  ------------------
  |  |   45|  18.4M|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  5.61M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 5.61M, False: 12.7M]
  |  |  ------------------
  |  |   46|  18.4M|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  12.7M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   33|  18.4M|  const int mv_col =
   34|  18.4M|      ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
  ------------------
  |  |   45|  18.4M|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  2.68M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 2.68M, False: 15.7M]
  |  |  ------------------
  |  |   46|  18.4M|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|  15.7M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   35|  18.4M|  const int clamp_max = MV_UPP - 1;
  ------------------
  |  |   75|  18.4M|#define MV_UPP (1 << MV_IN_USE_BITS)
  |  |  ------------------
  |  |  |  |   74|  18.4M|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
   36|  18.4M|  const int clamp_min = MV_LOW + 1;
  ------------------
  |  |   76|  18.4M|#define MV_LOW (-(1 << MV_IN_USE_BITS))
  |  |  ------------------
  |  |  |  |   74|  18.4M|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
   37|  18.4M|  output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max);
   38|  18.4M|  output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max);
   39|  18.4M|}
av1_copy_frame_mvs:
   43|  6.25M|                        int x_mis, int y_mis) {
   44|  6.25M|  const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1);
  ------------------
  |  |   41|  6.25M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   45|  6.25M|  MV_REF *frame_mvs =
   46|  6.25M|      cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
   47|  6.25M|  x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
  ------------------
  |  |   41|  6.25M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   48|  6.25M|  y_mis = ROUND_POWER_OF_TWO(y_mis, 1);
  ------------------
  |  |   41|  6.25M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   49|  6.25M|  int w, h;
   50|       |
   51|  19.3M|  for (h = 0; h < y_mis; h++) {
  ------------------
  |  Branch (51:15): [True: 13.0M, False: 6.25M]
  ------------------
   52|  13.0M|    MV_REF *mv = frame_mvs;
   53|  72.0M|    for (w = 0; w < x_mis; w++) {
  ------------------
  |  Branch (53:17): [True: 58.9M, False: 13.0M]
  ------------------
   54|  58.9M|      mv->ref_frame = NONE_FRAME;
   55|  58.9M|      mv->mv.as_int = 0;
   56|       |
   57|   176M|      for (int idx = 0; idx < 2; ++idx) {
  ------------------
  |  Branch (57:25): [True: 117M, False: 58.9M]
  ------------------
   58|   117M|        MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx];
   59|   117M|        if (ref_frame > INTRA_FRAME) {
  ------------------
  |  Branch (59:13): [True: 75.2M, False: 42.4M]
  ------------------
   60|  75.2M|          int8_t ref_idx = cm->ref_frame_side[ref_frame];
   61|  75.2M|          if (ref_idx) continue;
  ------------------
  |  Branch (61:15): [True: 13.2M, False: 62.0M]
  ------------------
   62|  62.0M|          if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) ||
  ------------------
  |  |   27|  62.0M|#define REFMVS_LIMIT ((1 << 12) - 1)
  ------------------
  |  Branch (62:15): [True: 1.77M, False: 60.2M]
  ------------------
   63|  62.0M|              (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT))
  ------------------
  |  |   27|  60.2M|#define REFMVS_LIMIT ((1 << 12) - 1)
  ------------------
  |  Branch (63:15): [True: 592k, False: 59.6M]
  ------------------
   64|  2.51M|            continue;
   65|  59.4M|          mv->ref_frame = ref_frame;
   66|  59.4M|          mv->mv.as_int = mi->mv[idx].as_int;
   67|  59.4M|        }
   68|   117M|      }
   69|  58.9M|      mv++;
   70|  58.9M|    }
   71|  13.0M|    frame_mvs += frame_mvs_stride;
   72|  13.0M|  }
   73|  6.25M|}
av1_find_mv_refs:
  794|  4.61M|                      int_mv *global_mvs, int16_t *mode_context) {
  795|  4.61M|  const int mi_row = xd->mi_row;
  796|  4.61M|  const int mi_col = xd->mi_col;
  797|  4.61M|  int_mv gm_mv[2];
  798|       |
  799|  4.61M|  if (ref_frame == INTRA_FRAME) {
  ------------------
  |  Branch (799:7): [True: 60.1k, False: 4.55M]
  ------------------
  800|  60.1k|    gm_mv[0].as_int = gm_mv[1].as_int = 0;
  801|  60.1k|    if (global_mvs != NULL) {
  ------------------
  |  Branch (801:9): [True: 0, False: 60.1k]
  ------------------
  802|      0|      global_mvs[ref_frame].as_int = INVALID_MV;
  ------------------
  |  |   26|      0|#define INVALID_MV 0x80008000
  ------------------
  803|      0|    }
  804|  4.55M|  } else {
  805|  4.55M|    const BLOCK_SIZE bsize = mi->bsize;
  806|  4.55M|    const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
  807|  4.55M|    const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
  808|  4.55M|    if (ref_frame < REF_FRAMES) {
  ------------------
  |  Branch (808:9): [True: 3.92M, False: 632k]
  ------------------
  809|  3.92M|      gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame],
  810|  3.92M|                                      allow_high_precision_mv, bsize, mi_col,
  811|  3.92M|                                      mi_row, force_integer_mv);
  812|  3.92M|      gm_mv[1].as_int = 0;
  813|  3.92M|      if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0];
  ------------------
  |  Branch (813:11): [True: 0, False: 3.92M]
  ------------------
  814|  3.92M|    } else {
  815|   632k|      MV_REFERENCE_FRAME rf[2];
  816|   632k|      av1_set_ref_frame(rf, ref_frame);
  817|   632k|      gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]],
  818|   632k|                                      allow_high_precision_mv, bsize, mi_col,
  819|   632k|                                      mi_row, force_integer_mv);
  820|   632k|      gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]],
  821|   632k|                                      allow_high_precision_mv, bsize, mi_col,
  822|   632k|                                      mi_row, force_integer_mv);
  823|   632k|    }
  824|  4.55M|  }
  825|       |
  826|  4.61M|  setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame],
  827|  4.61M|                    ref_mv_stack[ref_frame], ref_mv_weight[ref_frame],
  828|  4.61M|                    mv_ref_list ? mv_ref_list[ref_frame] : NULL, gm_mv, mi_row,
  ------------------
  |  Branch (828:21): [True: 4.61M, False: 18.4E]
  ------------------
  829|  4.61M|                    mi_col, mode_context);
  830|  4.61M|}
av1_find_best_ref_mvs:
  833|  3.18M|                           int_mv *near_mv, int is_integer) {
  834|  3.18M|  int i;
  835|       |  // Make sure all the candidates are properly clamped etc
  836|  9.56M|  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
  ------------------
  |  |  508|  9.56M|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (836:15): [True: 6.37M, False: 3.18M]
  ------------------
  837|  6.37M|    lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer);
  838|  6.37M|  }
  839|  3.18M|  *nearest_mv = mvlist[0];
  840|  3.18M|  *near_mv = mvlist[1];
  841|  3.18M|}
av1_setup_frame_buf_refs:
  843|   202k|void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
  844|   202k|  cm->cur_frame->order_hint = cm->current_frame.order_hint;
  845|   202k|  cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint;
  846|   202k|  cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level;
  847|   202k|  cm->cur_frame->filter_level[0] = -1;
  848|   202k|  cm->cur_frame->filter_level[1] = -1;
  849|   202k|  MV_REFERENCE_FRAME ref_frame;
  850|  1.62M|  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
  ------------------
  |  Branch (850:32): [True: 1.41M, False: 202k]
  ------------------
  851|  1.41M|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
  852|  1.41M|    if (buf != NULL) {
  ------------------
  |  Branch (852:9): [True: 553k, False: 866k]
  ------------------
  853|   553k|      cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint;
  854|   553k|      cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] =
  855|   553k|          buf->display_order_hint;
  856|   553k|    }
  857|  1.41M|  }
  858|   202k|}
av1_setup_frame_sign_bias:
  860|   202k|void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
  861|   202k|  MV_REFERENCE_FRAME ref_frame;
  862|  1.62M|  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
  ------------------
  |  Branch (862:32): [True: 1.41M, False: 202k]
  ------------------
  863|  1.41M|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
  864|  1.41M|    if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) {
  ------------------
  |  Branch (864:9): [True: 1.02M, False: 393k]
  |  Branch (864:62): [True: 553k, False: 472k]
  ------------------
  865|   553k|      const int ref_order_hint = buf->order_hint;
  866|   553k|      cm->ref_frame_sign_bias[ref_frame] =
  867|   553k|          (get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint,
  ------------------
  |  Branch (867:11): [True: 367k, False: 186k]
  ------------------
  868|   553k|                             (int)cm->current_frame.order_hint) <= 0)
  869|   553k|              ? 0
  870|   553k|              : 1;
  871|   866k|    } else {
  872|   866k|      cm->ref_frame_sign_bias[ref_frame] = 0;
  873|   866k|    }
  874|  1.41M|  }
  875|   202k|}
av1_calculate_ref_frame_side:
  993|   175k|void av1_calculate_ref_frame_side(AV1_COMMON *cm) {
  994|   175k|  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
  995|       |
  996|   175k|  memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
  997|   175k|  if (!order_hint_info->enable_order_hint) return;
  ------------------
  |  Branch (997:7): [True: 50.5k, False: 125k]
  ------------------
  998|       |
  999|   125k|  const int cur_order_hint = cm->cur_frame->order_hint;
 1000|       |
 1001|  1.00M|  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
  ------------------
  |  Branch (1001:36): [True: 875k, False: 125k]
  ------------------
 1002|   875k|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
 1003|   875k|    int order_hint = 0;
 1004|       |
 1005|   875k|    if (buf != NULL) order_hint = buf->order_hint;
  ------------------
  |  Branch (1005:9): [True: 450k, False: 424k]
  ------------------
 1006|       |
 1007|   875k|    if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0)
  ------------------
  |  Branch (1007:9): [True: 252k, False: 623k]
  ------------------
 1008|   252k|      cm->ref_frame_side[ref_frame] = 1;
 1009|   623k|    else if (order_hint == cur_order_hint)
  ------------------
  |  Branch (1009:14): [True: 194k, False: 428k]
  ------------------
 1010|   194k|      cm->ref_frame_side[ref_frame] = -1;
 1011|   875k|  }
 1012|   125k|}
av1_setup_motion_field:
 1014|  15.9k|void av1_setup_motion_field(AV1_COMMON *cm) {
 1015|  15.9k|  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
 1016|       |
 1017|  15.9k|  if (!order_hint_info->enable_order_hint) return;
  ------------------
  |  Branch (1017:7): [True: 0, False: 15.9k]
  ------------------
 1018|       |
 1019|  15.9k|  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
 1020|  15.9k|  int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) *
  ------------------
  |  |   44|  15.9k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|  15.9k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  15.9k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|  15.9k|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1021|  15.9k|             (cm->mi_params.mi_stride >> 1);
 1022|  56.6M|  for (int idx = 0; idx < size; ++idx) {
  ------------------
  |  Branch (1022:21): [True: 56.6M, False: 15.9k]
  ------------------
 1023|  56.6M|    tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV;
  ------------------
  |  |   26|  56.6M|#define INVALID_MV 0x80008000
  ------------------
 1024|  56.6M|    tpl_mvs_base[idx].ref_frame_offset = 0;
 1025|  56.6M|  }
 1026|       |
 1027|  15.9k|  const int cur_order_hint = cm->cur_frame->order_hint;
 1028|  15.9k|  const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME];
 1029|  15.9k|  int ref_order_hint[INTER_REFS_PER_FRAME];
 1030|       |
 1031|   127k|  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
  ------------------
  |  Branch (1031:36): [True: 111k, False: 15.9k]
  ------------------
 1032|   111k|    const int ref_idx = ref_frame - LAST_FRAME;
 1033|   111k|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
 1034|   111k|    int order_hint = 0;
 1035|       |
 1036|   111k|    if (buf != NULL) order_hint = buf->order_hint;
  ------------------
  |  Branch (1036:9): [True: 111k, False: 0]
  ------------------
 1037|       |
 1038|   111k|    ref_buf[ref_idx] = buf;
 1039|   111k|    ref_order_hint[ref_idx] = order_hint;
 1040|   111k|  }
 1041|       |
 1042|  15.9k|  int ref_stamp = MFMV_STACK_SIZE - 1;
  ------------------
  |  |  105|  15.9k|#define MFMV_STACK_SIZE 3
  ------------------
 1043|       |
 1044|  15.9k|  if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) {
  ------------------
  |  Branch (1044:7): [True: 15.9k, False: 0]
  ------------------
 1045|  15.9k|    const int alt_of_lst_order_hint =
 1046|  15.9k|        ref_buf[LAST_FRAME - LAST_FRAME]
 1047|  15.9k|            ->ref_order_hints[ALTREF_FRAME - LAST_FRAME];
 1048|       |
 1049|  15.9k|    const int is_lst_overlay =
 1050|  15.9k|        (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]);
 1051|  15.9k|    if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2);
  ------------------
  |  Branch (1051:9): [True: 12.6k, False: 3.23k]
  ------------------
 1052|  15.9k|    --ref_stamp;
 1053|  15.9k|  }
 1054|       |
 1055|  15.9k|  if (get_relative_dist(order_hint_info,
  ------------------
  |  Branch (1055:7): [True: 5.73k, False: 10.1k]
  ------------------
 1056|  15.9k|                        ref_order_hint[BWDREF_FRAME - LAST_FRAME],
 1057|  15.9k|                        cur_order_hint) > 0) {
 1058|  5.73k|    if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp;
  ------------------
  |  Branch (1058:9): [True: 2.00k, False: 3.72k]
  ------------------
 1059|  5.73k|  }
 1060|       |
 1061|  15.9k|  if (get_relative_dist(order_hint_info,
  ------------------
  |  Branch (1061:7): [True: 5.56k, False: 10.3k]
  ------------------
 1062|  15.9k|                        ref_order_hint[ALTREF2_FRAME - LAST_FRAME],
 1063|  15.9k|                        cur_order_hint) > 0) {
 1064|  5.56k|    if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp;
  ------------------
  |  Branch (1064:9): [True: 1.81k, False: 3.75k]
  ------------------
 1065|  5.56k|  }
 1066|       |
 1067|  15.9k|  if (get_relative_dist(order_hint_info,
  ------------------
  |  Branch (1067:7): [True: 6.96k, False: 8.96k]
  ------------------
 1068|  15.9k|                        ref_order_hint[ALTREF_FRAME - LAST_FRAME],
 1069|  15.9k|                        cur_order_hint) > 0 &&
 1070|  15.9k|      ref_stamp >= 0)
  ------------------
  |  Branch (1070:7): [True: 6.32k, False: 644]
  ------------------
 1071|  6.32k|    if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp;
  ------------------
  |  Branch (1071:9): [True: 1.16k, False: 5.15k]
  ------------------
 1072|       |
 1073|  15.9k|  if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2);
  ------------------
  |  Branch (1073:7): [True: 14.7k, False: 1.22k]
  ------------------
 1074|  15.9k|}
av1_selectSamples:
 1092|   239k|                          BLOCK_SIZE bsize) {
 1093|   239k|  const int bw = block_size_wide[bsize];
 1094|   239k|  const int bh = block_size_high[bsize];
 1095|   239k|  const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
  ------------------
  |  |   35|   239k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 42.7k, False: 196k]
  |  |  ------------------
  ------------------
 1096|   239k|  uint8_t ret = 0;
 1097|   239k|  assert(len <= LEAST_SQUARES_SAMPLES_MAX);
 1098|       |
 1099|       |  // Only keep the samples with MV differences within threshold.
 1100|   975k|  for (int i = 0; i < len; ++i) {
  ------------------
  |  Branch (1100:19): [True: 736k, False: 239k]
  ------------------
 1101|   736k|    const int diff = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
 1102|   736k|                     abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
 1103|   736k|    if (diff > thresh) continue;
  ------------------
  |  Branch (1103:9): [True: 282k, False: 453k]
  ------------------
 1104|   453k|    if (ret != i) {
  ------------------
  |  Branch (1104:9): [True: 76.0k, False: 377k]
  ------------------
 1105|  76.0k|      memcpy(pts + 2 * ret, pts + 2 * i, 2 * sizeof(pts[0]));
 1106|  76.0k|      memcpy(pts_inref + 2 * ret, pts_inref + 2 * i, 2 * sizeof(pts_inref[0]));
 1107|  76.0k|    }
 1108|   453k|    ++ret;
 1109|   453k|  }
 1110|       |  // Keep at least 1 sample.
 1111|   239k|  return AOMMAX(ret, 1);
  ------------------
  |  |   35|   239k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 141k, False: 97.6k]
  |  |  ------------------
  ------------------
 1112|   239k|}
av1_findSamples:
 1118|  2.39M|                        int *pts_inref) {
 1119|  2.39M|  const MB_MODE_INFO *const mbmi0 = xd->mi[0];
 1120|  2.39M|  const int ref_frame = mbmi0->ref_frame[0];
 1121|  2.39M|  const int up_available = xd->up_available;
 1122|  2.39M|  const int left_available = xd->left_available;
 1123|  2.39M|  uint8_t np = 0;
 1124|  2.39M|  int do_tl = 1;
 1125|  2.39M|  int do_tr = 1;
 1126|  2.39M|  const int mi_stride = xd->mi_stride;
 1127|  2.39M|  const int mi_row = xd->mi_row;
 1128|  2.39M|  const int mi_col = xd->mi_col;
 1129|       |
 1130|       |  // scan the nearest above rows
 1131|  2.39M|  if (up_available) {
  ------------------
  |  Branch (1131:7): [True: 2.28M, False: 103k]
  ------------------
 1132|  2.28M|    const int mi_row_offset = -1;
 1133|  2.28M|    const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
 1134|  2.28M|    uint8_t superblock_width = mi_size_wide[mbmi->bsize];
 1135|       |
 1136|  2.28M|    if (xd->width <= superblock_width) {
  ------------------
  |  Branch (1136:9): [True: 1.93M, False: 348k]
  ------------------
 1137|       |      // Handle "current block width <= above block width" case.
 1138|  1.93M|      const int col_offset = -mi_col % superblock_width;
 1139|       |
 1140|  1.93M|      if (col_offset < 0) do_tl = 0;
  ------------------
  |  Branch (1140:11): [True: 334k, False: 1.60M]
  ------------------
 1141|  1.93M|      if (col_offset + superblock_width > xd->width) do_tr = 0;
  ------------------
  |  Branch (1141:11): [True: 342k, False: 1.59M]
  ------------------
 1142|       |
 1143|  1.93M|      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1143:11): [True: 1.50M, False: 437k]
  |  Branch (1143:46): [True: 1.27M, False: 231k]
  ------------------
 1144|  1.27M|        record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
 1145|  1.27M|        pts += 2;
 1146|  1.27M|        pts_inref += 2;
 1147|  1.27M|        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  1.27M|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  1.27M|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
                      if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|      0|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|      0|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1147:13): [True: 0, False: 1.27M]
  ------------------
 1148|  1.27M|      }
 1149|  1.93M|    } else {
 1150|       |      // Handle "current block width > above block width" case.
 1151|  1.22M|      for (int i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
  ------------------
  |  |   34|  1.22M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.19M, False: 31.4k]
  |  |  ------------------
  ------------------
  |  Branch (1151:23): [True: 878k, False: 347k]
  ------------------
 1152|   878k|           i += superblock_width) {
 1153|   878k|        mbmi = xd->mi[i + mi_row_offset * mi_stride];
 1154|   878k|        superblock_width = mi_size_wide[mbmi->bsize];
 1155|       |
 1156|   878k|        if (mbmi->ref_frame[0] == ref_frame &&
  ------------------
  |  Branch (1156:13): [True: 641k, False: 236k]
  ------------------
 1157|   878k|            mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1157:13): [True: 577k, False: 64.0k]
  ------------------
 1158|   577k|          record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
 1159|   577k|          pts += 2;
 1160|   577k|          pts_inref += 2;
 1161|   577k|          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
  ------------------
  |  |   29|   577k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|   577k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1161:15): [True: 1.32k, False: 575k]
  ------------------
 1162|  1.32k|            return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  1.32k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  1.32k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
 1163|   577k|        }
 1164|   878k|      }
 1165|   348k|    }
 1166|  2.28M|  }
 1167|  2.39M|  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 1168|       |
 1169|       |  // scan the nearest left columns
 1170|  2.39M|  if (left_available) {
  ------------------
  |  Branch (1170:7): [True: 2.32M, False: 67.9k]
  ------------------
 1171|  2.32M|    const int mi_col_offset = -1;
 1172|  2.32M|    const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
 1173|  2.32M|    uint8_t superblock_height = mi_size_high[mbmi->bsize];
 1174|       |
 1175|  2.32M|    if (xd->height <= superblock_height) {
  ------------------
  |  Branch (1175:9): [True: 1.94M, False: 378k]
  ------------------
 1176|       |      // Handle "current block height <= above block height" case.
 1177|  1.94M|      const int row_offset = -mi_row % superblock_height;
 1178|       |
 1179|  1.94M|      if (row_offset < 0) do_tl = 0;
  ------------------
  |  Branch (1179:11): [True: 368k, False: 1.57M]
  ------------------
 1180|       |
 1181|  1.94M|      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1181:11): [True: 1.50M, False: 436k]
  |  Branch (1181:46): [True: 1.26M, False: 246k]
  ------------------
 1182|  1.26M|        record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1);
 1183|  1.26M|        pts += 2;
 1184|  1.26M|        pts_inref += 2;
 1185|  1.26M|        np++;
 1186|  1.26M|        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  1.26M|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  1.26M|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
                      if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|    438|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|    438|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1186:13): [True: 438, False: 1.26M]
  ------------------
 1187|  1.26M|      }
 1188|  1.94M|    } else {
 1189|       |      // Handle "current block height > above block height" case.
 1190|  1.31M|      for (int i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
  ------------------
  |  |   34|  1.31M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.29M, False: 22.8k]
  |  |  ------------------
  ------------------
  |  Branch (1190:23): [True: 945k, False: 373k]
  ------------------
 1191|   945k|           i += superblock_height) {
 1192|   945k|        mbmi = xd->mi[mi_col_offset + i * mi_stride];
 1193|   945k|        superblock_height = mi_size_high[mbmi->bsize];
 1194|       |
 1195|   945k|        if (mbmi->ref_frame[0] == ref_frame &&
  ------------------
  |  Branch (1195:13): [True: 697k, False: 248k]
  ------------------
 1196|   945k|            mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1196:13): [True: 637k, False: 59.3k]
  ------------------
 1197|   637k|          record_samples(mbmi, pts, pts_inref, i, 1, 0, -1);
 1198|   637k|          pts += 2;
 1199|   637k|          pts_inref += 2;
 1200|   637k|          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
  ------------------
  |  |   29|   637k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|   637k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1200:15): [True: 5.17k, False: 632k]
  ------------------
 1201|  5.17k|            return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  5.17k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  5.17k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
 1202|   637k|        }
 1203|   945k|      }
 1204|   378k|    }
 1205|  2.32M|  }
 1206|  2.38M|  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 1207|       |
 1208|       |  // Top-left block
 1209|  2.38M|  if (do_tl && left_available && up_available) {
  ------------------
  |  Branch (1209:7): [True: 1.68M, False: 702k]
  |  Branch (1209:16): [True: 1.61M, False: 67.9k]
  |  Branch (1209:34): [True: 1.53M, False: 82.8k]
  ------------------
 1210|  1.53M|    const int mi_row_offset = -1;
 1211|  1.53M|    const int mi_col_offset = -1;
 1212|  1.53M|    MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride];
 1213|       |
 1214|  1.53M|    if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1214:9): [True: 1.13M, False: 397k]
  |  Branch (1214:44): [True: 964k, False: 170k]
  ------------------
 1215|   964k|      record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
 1216|   964k|      pts += 2;
 1217|   964k|      pts_inref += 2;
 1218|   964k|      if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|   964k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|   964k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
                    if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  3.98k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  3.98k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1218:11): [True: 3.98k, False: 960k]
  ------------------
 1219|   964k|    }
 1220|  1.53M|  }
 1221|  2.38M|  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 1222|       |
 1223|       |  // Top-right block
 1224|  2.38M|  if (do_tr &&
  ------------------
  |  Branch (1224:7): [True: 2.03M, False: 341k]
  ------------------
 1225|  2.38M|      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) {
  ------------------
  |  |   35|  2.03M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 678k, False: 1.36M]
  |  |  ------------------
  ------------------
  |  Branch (1225:7): [True: 1.04M, False: 999k]
  ------------------
 1226|  1.04M|    const POSITION trb_pos = { -1, xd->width };
 1227|  1.04M|    const TileInfo *const tile = &xd->tile;
 1228|  1.04M|    if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
  ------------------
  |  Branch (1228:9): [True: 950k, False: 89.3k]
  ------------------
 1229|   950k|      const int mi_row_offset = -1;
 1230|   950k|      const int mi_col_offset = xd->width;
 1231|   950k|      const MB_MODE_INFO *mbmi =
 1232|   950k|          xd->mi[mi_col_offset + mi_row_offset * mi_stride];
 1233|       |
 1234|   950k|      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
  ------------------
  |  Branch (1234:11): [True: 673k, False: 277k]
  |  Branch (1234:46): [True: 571k, False: 101k]
  ------------------
 1235|   571k|        record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1);
 1236|   571k|        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|   571k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|   571k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
                      if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
  ------------------
  |  |   29|  4.71k|#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
  |  |  ------------------
  |  |  |  |   28|  4.71k|#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
  |  |  ------------------
  ------------------
  |  Branch (1236:13): [True: 4.71k, False: 567k]
  ------------------
 1237|   571k|      }
 1238|   950k|    }
 1239|  1.04M|  }
 1240|  2.37M|  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
 1241|       |
 1242|  2.37M|  return np;
 1243|  2.37M|}
av1_setup_skip_mode_allowed:
 1245|   181k|void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
 1246|   181k|  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
 1247|   181k|  SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
 1248|       |
 1249|   181k|  skip_mode_info->skip_mode_allowed = 0;
 1250|   181k|  skip_mode_info->ref_frame_idx_0 = INVALID_IDX;
  ------------------
  |  |   15|   181k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1251|   181k|  skip_mode_info->ref_frame_idx_1 = INVALID_IDX;
  ------------------
  |  |   15|   181k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1252|       |
 1253|   181k|  if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) ||
  ------------------
  |  Branch (1253:7): [True: 51.6k, False: 130k]
  |  Branch (1253:46): [True: 84.1k, False: 46.0k]
  ------------------
 1254|   181k|      cm->current_frame.reference_mode == SINGLE_REFERENCE)
  ------------------
  |  Branch (1254:7): [True: 19.8k, False: 26.2k]
  ------------------
 1255|   155k|    return;
 1256|       |
 1257|  26.2k|  const int cur_order_hint = cm->current_frame.order_hint;
 1258|  26.2k|  int ref_order_hints[2] = { -1, INT_MAX };
 1259|  26.2k|  int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
  ------------------
  |  |   15|  26.2k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
                int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
  ------------------
  |  |   15|  26.2k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 1260|       |
 1261|       |  // Identify the nearest forward and backward references.
 1262|   209k|  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (1262:19): [True: 183k, False: 26.2k]
  ------------------
 1263|   183k|    const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
 1264|   183k|    if (buf == NULL) continue;
  ------------------
  |  Branch (1264:9): [True: 0, False: 183k]
  ------------------
 1265|       |
 1266|   183k|    const int ref_order_hint = buf->order_hint;
 1267|   183k|    if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) <
  ------------------
  |  Branch (1267:9): [True: 130k, False: 52.6k]
  ------------------
 1268|   183k|        0) {
 1269|       |      // Forward reference
 1270|   130k|      if (ref_order_hints[0] == -1 ||
  ------------------
  |  Branch (1270:11): [True: 25.2k, False: 105k]
  ------------------
 1271|   130k|          get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1271:11): [True: 18.8k, False: 86.7k]
  ------------------
 1272|   105k|                            ref_order_hints[0]) > 0) {
 1273|  44.1k|        ref_order_hints[0] = ref_order_hint;
 1274|  44.1k|        ref_idx[0] = i;
 1275|  44.1k|      }
 1276|   130k|    } else if (get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1276:16): [True: 23.2k, False: 29.4k]
  ------------------
 1277|  52.6k|                                 cur_order_hint) > 0) {
 1278|       |      // Backward reference
 1279|  23.2k|      if (ref_order_hints[1] == INT_MAX ||
  ------------------
  |  Branch (1279:11): [True: 9.99k, False: 13.2k]
  ------------------
 1280|  23.2k|          get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1280:11): [True: 1.13k, False: 12.0k]
  ------------------
 1281|  13.2k|                            ref_order_hints[1]) < 0) {
 1282|  11.1k|        ref_order_hints[1] = ref_order_hint;
 1283|  11.1k|        ref_idx[1] = i;
 1284|  11.1k|      }
 1285|  23.2k|    }
 1286|   183k|  }
 1287|       |
 1288|  26.2k|  if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
  ------------------
  |  |   15|  52.4k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
                if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
  ------------------
  |  |   15|  25.2k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1288:7): [True: 25.2k, False: 926]
  |  Branch (1288:36): [True: 9.09k, False: 16.2k]
  ------------------
 1289|       |    // == Bi-directional prediction ==
 1290|  9.09k|    skip_mode_info->skip_mode_allowed = 1;
 1291|  9.09k|    skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
  ------------------
  |  |   34|  9.09k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 8.10k, False: 991]
  |  |  ------------------
  ------------------
 1292|  9.09k|    skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
  ------------------
  |  |   35|  9.09k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 991, False: 8.10k]
  |  |  ------------------
  ------------------
 1293|  17.1k|  } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
  ------------------
  |  |   15|  34.2k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
                } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
  ------------------
  |  |   15|  16.2k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (1293:14): [True: 16.2k, False: 926]
  |  Branch (1293:43): [True: 16.2k, False: 0]
  ------------------
 1294|       |    // == Forward prediction only ==
 1295|       |    // Identify the second nearest forward reference.
 1296|  16.2k|    ref_order_hints[1] = -1;
 1297|   129k|    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (1297:21): [True: 113k, False: 16.2k]
  ------------------
 1298|   113k|      const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
 1299|   113k|      if (buf == NULL) continue;
  ------------------
  |  Branch (1299:11): [True: 0, False: 113k]
  ------------------
 1300|       |
 1301|   113k|      const int ref_order_hint = buf->order_hint;
 1302|   113k|      if ((ref_order_hints[0] != -1 &&
  ------------------
  |  Branch (1302:12): [True: 113k, False: 0]
  ------------------
 1303|   113k|           get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1303:12): [True: 38.5k, False: 74.8k]
  ------------------
 1304|   113k|                             ref_order_hints[0]) < 0) &&
 1305|   113k|          (ref_order_hints[1] == -1 ||
  ------------------
  |  Branch (1305:12): [True: 14.3k, False: 24.2k]
  ------------------
 1306|  38.5k|           get_relative_dist(order_hint_info, ref_order_hint,
  ------------------
  |  Branch (1306:12): [True: 6.92k, False: 17.3k]
  ------------------
 1307|  24.2k|                             ref_order_hints[1]) > 0)) {
 1308|       |        // Second closest forward reference
 1309|  21.2k|        ref_order_hints[1] = ref_order_hint;
 1310|  21.2k|        ref_idx[1] = i;
 1311|  21.2k|      }
 1312|   113k|    }
 1313|  16.2k|    if (ref_order_hints[1] != -1) {
  ------------------
  |  Branch (1313:9): [True: 14.3k, False: 1.89k]
  ------------------
 1314|  14.3k|      skip_mode_info->skip_mode_allowed = 1;
 1315|  14.3k|      skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
  ------------------
  |  |   34|  14.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 7.35k, False: 6.95k]
  |  |  ------------------
  ------------------
 1316|  14.3k|      skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
  ------------------
  |  |   35|  14.3k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 6.95k, False: 7.35k]
  |  |  ------------------
  ------------------
 1317|  14.3k|    }
 1318|  16.2k|  }
 1319|  26.2k|}
av1_set_frame_refs:
 1346|  25.5k|                        int lst_map_idx, int gld_map_idx) {
 1347|  25.5k|  int lst_frame_sort_idx = -1;
 1348|  25.5k|  int gld_frame_sort_idx = -1;
 1349|       |
 1350|  25.5k|  assert(cm->seq_params->order_hint_info.enable_order_hint);
 1351|  25.5k|  assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0);
 1352|  25.5k|  const int cur_order_hint = (int)cm->current_frame.order_hint;
 1353|  25.5k|  const int cur_frame_sort_idx =
 1354|  25.5k|      1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1;
 1355|       |
 1356|  25.5k|  REF_FRAME_INFO ref_frame_info[REF_FRAMES];
 1357|  25.5k|  int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
 1358|       |
 1359|   229k|  for (int i = 0; i < REF_FRAMES; ++i) {
  ------------------
  |  Branch (1359:19): [True: 204k, False: 25.5k]
  ------------------
 1360|   204k|    const int map_idx = i;
 1361|       |
 1362|   204k|    ref_frame_info[i].map_idx = map_idx;
 1363|   204k|    ref_frame_info[i].sort_idx = -1;
 1364|       |
 1365|   204k|    RefCntBuffer *const buf = cm->ref_frame_map[map_idx];
 1366|   204k|    ref_frame_info[i].buf = buf;
 1367|       |
 1368|   204k|    if (buf == NULL) continue;
  ------------------
  |  Branch (1368:9): [True: 13.9k, False: 190k]
  ------------------
 1369|       |    // If this assertion fails, there is a reference leak.
 1370|   190k|    assert(buf->ref_count > 0);
 1371|       |
 1372|   190k|    const int offset = (int)buf->order_hint;
 1373|   190k|    ref_frame_info[i].sort_idx =
 1374|   190k|        (offset == -1) ? -1
  ------------------
  |  Branch (1374:9): [True: 0, False: 190k]
  ------------------
 1375|   190k|                       : cur_frame_sort_idx +
 1376|   190k|                             get_relative_dist(&cm->seq_params->order_hint_info,
 1377|   190k|                                               offset, cur_order_hint);
 1378|   190k|    assert(ref_frame_info[i].sort_idx >= -1);
 1379|       |
 1380|   190k|    if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx;
  ------------------
  |  Branch (1380:9): [True: 25.5k, False: 164k]
  ------------------
 1381|   190k|    if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx;
  ------------------
  |  Branch (1381:9): [True: 25.5k, False: 164k]
  ------------------
 1382|   190k|  }
 1383|       |
 1384|       |  // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference
 1385|       |  // frames.
 1386|  25.5k|  if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) {
  ------------------
  |  Branch (1386:7): [True: 0, False: 25.5k]
  |  Branch (1386:35): [True: 271, False: 25.2k]
  ------------------
 1387|    271|    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 1388|    271|                       "Inter frame requests a look-ahead frame as LAST");
 1389|    271|  }
 1390|  25.5k|  if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) {
  ------------------
  |  Branch (1390:7): [True: 271, False: 25.2k]
  |  Branch (1390:35): [True: 164, False: 25.1k]
  ------------------
 1391|    164|    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 1392|    164|                       "Inter frame requests a look-ahead frame as GOLDEN");
 1393|    164|  }
 1394|       |
 1395|       |  // Sort ref frames based on their frame_offset values.
 1396|  25.5k|  qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO),
 1397|  25.5k|        compare_ref_frame_info);
 1398|       |
 1399|       |  // Identify forward and backward reference frames.
 1400|       |  // Forward  reference: offset < order_hint
 1401|       |  // Backward reference: offset >= order_hint
 1402|  25.5k|  int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1;
 1403|       |
 1404|   178k|  for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (1404:19): [True: 173k, False: 4.62k]
  ------------------
 1405|   173k|    if (ref_frame_info[i].sort_idx == -1) {
  ------------------
  |  Branch (1405:9): [True: 13.8k, False: 159k]
  ------------------
 1406|  13.8k|      fwd_start_idx++;
 1407|  13.8k|      continue;
 1408|  13.8k|    }
 1409|       |
 1410|   159k|    if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) {
  ------------------
  |  Branch (1410:9): [True: 20.9k, False: 138k]
  ------------------
 1411|  20.9k|      fwd_end_idx = i - 1;
 1412|  20.9k|      break;
 1413|  20.9k|    }
 1414|   159k|  }
 1415|       |
 1416|  25.5k|  int bwd_start_idx = fwd_end_idx + 1;
 1417|  25.5k|  int bwd_end_idx = REF_FRAMES - 1;
 1418|       |
 1419|       |  // === Backward Reference Frames ===
 1420|       |
 1421|       |  // == ALTREF_FRAME ==
 1422|  25.5k|  if (bwd_start_idx <= bwd_end_idx) {
  ------------------
  |  Branch (1422:7): [True: 20.9k, False: 4.62k]
  ------------------
 1423|  20.9k|    set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME,
 1424|  20.9k|                       &ref_frame_info[bwd_end_idx]);
 1425|  20.9k|    ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
 1426|  20.9k|    bwd_end_idx--;
 1427|  20.9k|  }
 1428|       |
 1429|       |  // == BWDREF_FRAME ==
 1430|  25.5k|  if (bwd_start_idx <= bwd_end_idx) {
  ------------------
  |  Branch (1430:7): [True: 13.5k, False: 11.9k]
  ------------------
 1431|  13.5k|    set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME,
 1432|  13.5k|                       &ref_frame_info[bwd_start_idx]);
 1433|  13.5k|    ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
 1434|  13.5k|    bwd_start_idx++;
 1435|  13.5k|  }
 1436|       |
 1437|       |  // == ALTREF2_FRAME ==
 1438|  25.5k|  if (bwd_start_idx <= bwd_end_idx) {
  ------------------
  |  Branch (1438:7): [True: 7.03k, False: 18.5k]
  ------------------
 1439|  7.03k|    set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME,
 1440|  7.03k|                       &ref_frame_info[bwd_start_idx]);
 1441|  7.03k|    ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
 1442|  7.03k|  }
 1443|       |
 1444|       |  // === Forward Reference Frames ===
 1445|       |
 1446|   164k|  for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
  ------------------
  |  Branch (1446:31): [True: 138k, False: 25.5k]
  ------------------
 1447|       |    // == LAST_FRAME ==
 1448|   138k|    if (ref_frame_info[i].map_idx == lst_map_idx) {
  ------------------
  |  Branch (1448:9): [True: 25.1k, False: 113k]
  ------------------
 1449|  25.1k|      set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME,
 1450|  25.1k|                         &ref_frame_info[i]);
 1451|  25.1k|      ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
 1452|  25.1k|    }
 1453|       |
 1454|       |    // == GOLDEN_FRAME ==
 1455|   138k|    if (ref_frame_info[i].map_idx == gld_map_idx) {
  ------------------
  |  Branch (1455:9): [True: 25.1k, False: 113k]
  ------------------
 1456|  25.1k|      set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME,
 1457|  25.1k|                         &ref_frame_info[i]);
 1458|  25.1k|      ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
 1459|  25.1k|    }
 1460|   138k|  }
 1461|       |
 1462|  25.5k|  assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 &&
 1463|  25.5k|         ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1);
 1464|       |
 1465|       |  // == LAST2_FRAME ==
 1466|       |  // == LAST3_FRAME ==
 1467|       |  // == BWDREF_FRAME ==
 1468|       |  // == ALTREF2_FRAME ==
 1469|       |  // == ALTREF_FRAME ==
 1470|       |
 1471|       |  // Set up the reference frames in the anti-chronological order.
 1472|  25.1k|  static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = {
 1473|  25.1k|    LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME
 1474|  25.1k|  };
 1475|       |
 1476|  25.1k|  int ref_idx;
 1477|   128k|  for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
  ------------------
  |  Branch (1477:21): [True: 109k, False: 18.8k]
  ------------------
 1478|   109k|    const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
 1479|       |
 1480|   109k|    if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
  ------------------
  |  Branch (1480:9): [True: 30.7k, False: 78.7k]
  ------------------
 1481|       |
 1482|   104k|    while (fwd_start_idx <= fwd_end_idx &&
  ------------------
  |  Branch (1482:12): [True: 97.7k, False: 6.30k]
  ------------------
 1483|   104k|           (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx ||
  ------------------
  |  Branch (1483:13): [True: 10.5k, False: 87.1k]
  ------------------
 1484|  97.7k|            ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) {
  ------------------
  |  Branch (1484:13): [True: 14.7k, False: 72.4k]
  ------------------
 1485|  25.3k|      fwd_end_idx--;
 1486|  25.3k|    }
 1487|  78.7k|    if (fwd_start_idx > fwd_end_idx) break;
  ------------------
  |  Branch (1487:9): [True: 6.30k, False: 72.4k]
  ------------------
 1488|       |
 1489|  72.4k|    set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
 1490|  72.4k|                       &ref_frame_info[fwd_end_idx]);
 1491|  72.4k|    ref_flag_list[ref_frame - LAST_FRAME] = 1;
 1492|       |
 1493|  72.4k|    fwd_end_idx--;
 1494|  72.4k|  }
 1495|       |
 1496|       |  // Assign all the remaining frame(s), if any, to the earliest reference
 1497|       |  // frame.
 1498|  47.5k|  for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
  ------------------
  |  Branch (1498:10): [True: 22.4k, False: 25.1k]
  ------------------
 1499|  22.4k|    const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
 1500|  22.4k|    if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
  ------------------
  |  Branch (1500:9): [True: 10.7k, False: 11.6k]
  ------------------
 1501|  11.6k|    set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
 1502|  11.6k|                       &ref_frame_info[fwd_start_idx]);
 1503|  11.6k|    ref_flag_list[ref_frame - LAST_FRAME] = 1;
 1504|  11.6k|  }
 1505|       |
 1506|   200k|  for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
  ------------------
  |  Branch (1506:19): [True: 175k, False: 25.1k]
  ------------------
 1507|   175k|    assert(ref_flag_list[i] == 1);
 1508|   175k|  }
 1509|  25.1k|}
mvref_common.c:setup_ref_mv_list:
  485|  4.61M|    int mi_row, int mi_col, int16_t *mode_context) {
  486|  4.61M|  const int bs = AOMMAX(xd->width, xd->height);
  ------------------
  |  |   35|  4.61M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.57M, False: 3.04M]
  |  |  ------------------
  ------------------
  487|  4.61M|  const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
  488|  4.61M|  MV_REFERENCE_FRAME rf[2];
  489|       |
  490|  4.61M|  const TileInfo *const tile = &xd->tile;
  491|  4.61M|  int max_row_offset = 0, max_col_offset = 0;
  492|  4.61M|  const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
  ------------------
  |  Branch (492:23): [True: 992k, False: 3.62M]
  |  Branch (492:65): [True: 495k, False: 496k]
  ------------------
  493|  4.61M|  const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
  ------------------
  |  Branch (493:23): [True: 830k, False: 3.78M]
  |  Branch (493:64): [True: 415k, False: 415k]
  ------------------
  494|  4.61M|  int processed_rows = 0;
  495|  4.61M|  int processed_cols = 0;
  496|       |
  497|  4.61M|  av1_set_ref_frame(rf, ref_frame);
  498|  4.61M|  mode_context[ref_frame] = 0;
  499|  4.61M|  *refmv_count = 0;
  500|       |
  501|       |  // Find valid maximum row/col offset.
  502|  4.61M|  if (xd->up_available) {
  ------------------
  |  Branch (502:7): [True: 4.39M, False: 222k]
  ------------------
  503|  4.39M|    max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
  ------------------
  |  |   21|  4.39M|#define MVREF_ROW_COLS 3
  ------------------
  504|       |
  505|  4.39M|    if (xd->height < mi_size_high[BLOCK_8X8])
  ------------------
  |  Branch (505:9): [True: 986k, False: 3.40M]
  ------------------
  506|   986k|      max_row_offset = -(2 << 1) + row_adj;
  507|       |
  508|  4.39M|    max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
  509|  4.39M|  }
  510|       |
  511|  4.61M|  if (xd->left_available) {
  ------------------
  |  Branch (511:7): [True: 4.47M, False: 141k]
  ------------------
  512|  4.47M|    max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
  ------------------
  |  |   21|  4.47M|#define MVREF_ROW_COLS 3
  ------------------
  513|       |
  514|  4.47M|    if (xd->width < mi_size_wide[BLOCK_8X8])
  ------------------
  |  Branch (514:9): [True: 827k, False: 3.64M]
  ------------------
  515|   827k|      max_col_offset = -(2 << 1) + col_adj;
  516|       |
  517|  4.47M|    max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
  518|  4.47M|  }
  519|       |
  520|  4.61M|  uint8_t col_match_count = 0;
  521|  4.61M|  uint8_t row_match_count = 0;
  522|  4.61M|  uint8_t newmv_count = 0;
  523|       |
  524|       |  // Scan the first above row mode info. row_offset = -1;
  525|  4.61M|  if (abs(max_row_offset) >= 1)
  ------------------
  |  Branch (525:7): [True: 4.39M, False: 222k]
  ------------------
  526|  4.39M|    scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight,
  527|  4.39M|                  refmv_count, &row_match_count, &newmv_count, gm_mv_candidates,
  528|  4.39M|                  max_row_offset, &processed_rows);
  529|       |  // Scan the first left column mode info. col_offset = -1;
  530|  4.61M|  if (abs(max_col_offset) >= 1)
  ------------------
  |  Branch (530:7): [True: 4.47M, False: 140k]
  ------------------
  531|  4.47M|    scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight,
  532|  4.47M|                  refmv_count, &col_match_count, &newmv_count, gm_mv_candidates,
  533|  4.47M|                  max_col_offset, &processed_cols);
  534|       |  // Check top-right boundary
  535|  4.61M|  if (has_tr)
  ------------------
  |  Branch (535:7): [True: 2.58M, False: 2.03M]
  ------------------
  536|  2.58M|    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack,
  537|  2.58M|                  ref_mv_weight, &row_match_count, &newmv_count,
  538|  2.58M|                  gm_mv_candidates, refmv_count);
  539|       |
  540|  4.61M|  const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
  541|  4.61M|  const uint8_t nearest_refmv_count = *refmv_count;
  542|       |
  543|       |  // TODO(yunqing): for comp_search, do it for all 3 cases.
  544|  11.1M|  for (int idx = 0; idx < nearest_refmv_count; ++idx)
  ------------------
  |  Branch (544:21): [True: 6.50M, False: 4.61M]
  ------------------
  545|  6.50M|    ref_mv_weight[idx] += REF_CAT_LEVEL;
  ------------------
  |  |  512|  6.50M|#define REF_CAT_LEVEL 640
  ------------------
  546|       |
  547|  4.61M|  if (cm->features.allow_ref_frame_mvs) {
  ------------------
  |  Branch (547:7): [True: 3.68M, False: 934k]
  ------------------
  548|  3.68M|    int is_available = 0;
  549|  3.68M|    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height);
  ------------------
  |  |   35|  3.68M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 772k, False: 2.90M]
  |  |  ------------------
  ------------------
  550|  3.68M|    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width);
  ------------------
  |  |   35|  3.68M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 653k, False: 3.02M]
  |  |  ------------------
  ------------------
  551|  3.68M|    const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]);
  ------------------
  |  |   34|  3.68M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.50M, False: 176k]
  |  |  ------------------
  ------------------
  552|  3.68M|    const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]);
  ------------------
  |  |   34|  3.68M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.49M, False: 186k]
  |  |  ------------------
  ------------------
  553|       |
  554|  3.68M|    const int tpl_sample_pos[3][2] = {
  555|  3.68M|      { voffset, -2 },
  556|  3.68M|      { voffset, hoffset },
  557|  3.68M|      { voffset - 2, hoffset },
  558|  3.68M|    };
  559|  3.68M|    const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) &&
  ------------------
  |  Branch (559:33): [True: 2.90M, False: 772k]
  ------------------
  560|  3.68M|                                (xd->height < mi_size_high[BLOCK_64X64]) &&
  ------------------
  |  Branch (560:33): [True: 2.73M, False: 176k]
  ------------------
  561|  3.68M|                                (xd->width >= mi_size_wide[BLOCK_8X8]) &&
  ------------------
  |  Branch (561:33): [True: 2.26M, False: 465k]
  ------------------
  562|  3.68M|                                (xd->width < mi_size_wide[BLOCK_64X64]);
  ------------------
  |  Branch (562:33): [True: 2.21M, False: 55.9k]
  ------------------
  563|       |
  564|  3.68M|    const int step_h = (xd->height >= mi_size_high[BLOCK_64X64])
  ------------------
  |  Branch (564:24): [True: 176k, False: 3.50M]
  ------------------
  565|  3.68M|                           ? mi_size_high[BLOCK_16X16]
  566|  3.68M|                           : mi_size_high[BLOCK_8X8];
  567|  3.68M|    const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64])
  ------------------
  |  Branch (567:24): [True: 186k, False: 3.49M]
  ------------------
  568|  3.68M|                           ? mi_size_wide[BLOCK_16X16]
  569|  3.68M|                           : mi_size_wide[BLOCK_8X8];
  570|       |
  571|  9.99M|    for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
  ------------------
  |  Branch (571:27): [True: 6.31M, False: 3.68M]
  ------------------
  572|  19.7M|      for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
  ------------------
  |  Branch (572:29): [True: 13.4M, False: 6.31M]
  ------------------
  573|  13.4M|        int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row,
  574|  13.4M|                                 blk_col, gm_mv_candidates, refmv_count,
  575|  13.4M|                                 ref_mv_stack, ref_mv_weight, mode_context);
  576|  13.4M|        if (blk_row == 0 && blk_col == 0) is_available = ret;
  ------------------
  |  Branch (576:13): [True: 6.79M, False: 6.67M]
  |  Branch (576:29): [True: 3.68M, False: 3.11M]
  ------------------
  577|  13.4M|      }
  578|  6.31M|    }
  579|       |
  580|  3.68M|    if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
  ------------------
  |  |  487|  3.47M|#define GLOBALMV_OFFSET 3
  ------------------
  |  Branch (580:9): [True: 3.47M, False: 207k]
  ------------------
  581|       |
  582|  10.3M|    for (int i = 0; i < 3 && allow_extension; ++i) {
  ------------------
  |  Branch (582:21): [True: 8.10M, False: 2.21M]
  |  Branch (582:30): [True: 6.63M, False: 1.46M]
  ------------------
  583|  6.63M|      const int blk_row = tpl_sample_pos[i][0];
  584|  6.63M|      const int blk_col = tpl_sample_pos[i][1];
  585|       |
  586|  6.63M|      if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue;
  ------------------
  |  Branch (586:11): [True: 2.35M, False: 4.27M]
  ------------------
  587|  4.27M|      add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col,
  588|  4.27M|                     gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight,
  589|  4.27M|                     mode_context);
  590|  4.27M|    }
  591|  3.68M|  }
  592|       |
  593|  4.61M|  uint8_t dummy_newmv_count = 0;
  594|       |
  595|       |  // Scan the second outer area.
  596|  4.61M|  scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight,
  597|  4.61M|                &row_match_count, &dummy_newmv_count, gm_mv_candidates,
  598|  4.61M|                refmv_count);
  599|       |
  600|  13.8M|  for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) {
  ------------------
  |  |   21|  13.8M|#define MVREF_ROW_COLS 3
  ------------------
  |  Branch (600:21): [True: 9.23M, False: 4.61M]
  ------------------
  601|  9.23M|    const int row_offset = -(idx << 1) + 1 + row_adj;
  602|  9.23M|    const int col_offset = -(idx << 1) + 1 + col_adj;
  603|       |
  604|  9.23M|    if (abs(row_offset) <= abs(max_row_offset) &&
  ------------------
  |  Branch (604:9): [True: 7.69M, False: 1.53M]
  ------------------
  605|  9.23M|        abs(row_offset) > processed_rows)
  ------------------
  |  Branch (605:9): [True: 5.45M, False: 2.24M]
  ------------------
  606|  5.45M|      scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight,
  607|  5.45M|                    refmv_count, &row_match_count, &dummy_newmv_count,
  608|  5.45M|                    gm_mv_candidates, max_row_offset, &processed_rows);
  609|       |
  610|  9.23M|    if (abs(col_offset) <= abs(max_col_offset) &&
  ------------------
  |  Branch (610:9): [True: 8.03M, False: 1.19M]
  ------------------
  611|  9.23M|        abs(col_offset) > processed_cols)
  ------------------
  |  Branch (611:9): [True: 5.43M, False: 2.60M]
  ------------------
  612|  5.43M|      scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight,
  613|  5.43M|                    refmv_count, &col_match_count, &dummy_newmv_count,
  614|  5.43M|                    gm_mv_candidates, max_col_offset, &processed_cols);
  615|  9.23M|  }
  616|       |
  617|  4.61M|  const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
  618|       |
  619|  4.61M|  switch (nearest_match) {
  620|   572k|    case 0:
  ------------------
  |  Branch (620:5): [True: 572k, False: 4.04M]
  ------------------
  621|   572k|      if (ref_match_count >= 1) mode_context[ref_frame] |= 1;
  ------------------
  |  Branch (621:11): [True: 139k, False: 433k]
  ------------------
  622|   572k|      if (ref_match_count == 1)
  ------------------
  |  Branch (622:11): [True: 115k, False: 457k]
  ------------------
  623|   115k|        mode_context[ref_frame] |= (1 << REFMV_OFFSET);
  ------------------
  |  |  488|   115k|#define REFMV_OFFSET 4
  ------------------
  624|   457k|      else if (ref_match_count >= 2)
  ------------------
  |  Branch (624:16): [True: 24.1k, False: 433k]
  ------------------
  625|  24.1k|        mode_context[ref_frame] |= (2 << REFMV_OFFSET);
  ------------------
  |  |  488|  24.1k|#define REFMV_OFFSET 4
  ------------------
  626|   572k|      break;
  627|  1.32M|    case 1:
  ------------------
  |  Branch (627:5): [True: 1.32M, False: 3.29M]
  ------------------
  628|  1.32M|      mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
  ------------------
  |  Branch (628:34): [True: 613k, False: 710k]
  ------------------
  629|  1.32M|      if (ref_match_count == 1)
  ------------------
  |  Branch (629:11): [True: 832k, False: 491k]
  ------------------
  630|   832k|        mode_context[ref_frame] |= (3 << REFMV_OFFSET);
  ------------------
  |  |  488|   832k|#define REFMV_OFFSET 4
  ------------------
  631|   491k|      else if (ref_match_count >= 2)
  ------------------
  |  Branch (631:16): [True: 491k, False: 18.4E]
  ------------------
  632|   491k|        mode_context[ref_frame] |= (4 << REFMV_OFFSET);
  ------------------
  |  |  488|   491k|#define REFMV_OFFSET 4
  ------------------
  633|  1.32M|      break;
  634|  2.72M|    case 2:
  ------------------
  |  Branch (634:5): [True: 2.72M, False: 1.89M]
  ------------------
  635|  2.72M|    default:
  ------------------
  |  Branch (635:5): [True: 0, False: 4.61M]
  ------------------
  636|  2.72M|      if (newmv_count >= 1)
  ------------------
  |  Branch (636:11): [True: 1.61M, False: 1.10M]
  ------------------
  637|  1.61M|        mode_context[ref_frame] |= 4;
  638|  1.10M|      else
  639|  1.10M|        mode_context[ref_frame] |= 5;
  640|       |
  641|  2.72M|      mode_context[ref_frame] |= (5 << REFMV_OFFSET);
  ------------------
  |  |  488|  2.72M|#define REFMV_OFFSET 4
  ------------------
  642|  2.72M|      break;
  643|  4.61M|  }
  644|       |
  645|       |  // Rank the likelihood and assign nearest and near mvs.
  646|  4.61M|  int len = nearest_refmv_count;
  647|  9.52M|  while (len > 0) {
  ------------------
  |  Branch (647:10): [True: 4.90M, False: 4.61M]
  ------------------
  648|  4.90M|    int nr_len = 0;
  649|  7.76M|    for (int idx = 1; idx < len; ++idx) {
  ------------------
  |  Branch (649:23): [True: 2.86M, False: 4.90M]
  ------------------
  650|  2.86M|      if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
  ------------------
  |  Branch (650:11): [True: 1.02M, False: 1.83M]
  ------------------
  651|  1.02M|        const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
  652|  1.02M|        const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
  653|  1.02M|        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
  654|  1.02M|        ref_mv_stack[idx] = tmp_mv;
  655|  1.02M|        ref_mv_weight[idx - 1] = ref_mv_weight[idx];
  656|  1.02M|        ref_mv_weight[idx] = tmp_ref_mv_weight;
  657|  1.02M|        nr_len = idx;
  658|  1.02M|      }
  659|  2.86M|    }
  660|  4.90M|    len = nr_len;
  661|  4.90M|  }
  662|       |
  663|  4.61M|  len = *refmv_count;
  664|  7.54M|  while (len > nearest_refmv_count) {
  ------------------
  |  Branch (664:10): [True: 2.92M, False: 4.61M]
  ------------------
  665|  2.92M|    int nr_len = nearest_refmv_count;
  666|  5.24M|    for (int idx = nearest_refmv_count + 1; idx < len; ++idx) {
  ------------------
  |  Branch (666:45): [True: 2.31M, False: 2.92M]
  ------------------
  667|  2.31M|      if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
  ------------------
  |  Branch (667:11): [True: 730k, False: 1.58M]
  ------------------
  668|   730k|        const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
  669|   730k|        const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
  670|   730k|        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
  671|   730k|        ref_mv_stack[idx] = tmp_mv;
  672|   730k|        ref_mv_weight[idx - 1] = ref_mv_weight[idx];
  673|   730k|        ref_mv_weight[idx] = tmp_ref_mv_weight;
  674|   730k|        nr_len = idx;
  675|   730k|      }
  676|  2.31M|    }
  677|  2.92M|    len = nr_len;
  678|  2.92M|  }
  679|       |
  680|  4.61M|  int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width);
  ------------------
  |  |   34|  4.61M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 127k, False: 4.48M]
  |  |  ------------------
  ------------------
  681|  4.61M|  mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col);
  ------------------
  |  |   34|  4.61M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 4.45M, False: 163k]
  |  |  ------------------
  ------------------
  682|  4.61M|  int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height);
  ------------------
  |  |   34|  4.61M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 162k, False: 4.45M]
  |  |  ------------------
  ------------------
  683|  4.61M|  mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row);
  ------------------
  |  |   34|  4.61M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 4.53M, False: 79.4k]
  |  |  ------------------
  ------------------
  684|  4.61M|  const int mi_size = AOMMIN(mi_width, mi_height);
  ------------------
  |  |   34|  4.61M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.12M, False: 3.48M]
  |  |  ------------------
  ------------------
  685|  4.61M|  if (rf[1] > NONE_FRAME) {
  ------------------
  |  Branch (685:7): [True: 634k, False: 3.98M]
  ------------------
  686|       |    // TODO(jingning, yunqing): Refactor and consolidate the compound and
  687|       |    // single reference frame modes. Reduce unnecessary redundancy.
  688|   634k|    if (*refmv_count < MAX_MV_REF_CANDIDATES) {
  ------------------
  |  |  508|   634k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (688:9): [True: 426k, False: 208k]
  ------------------
  689|   426k|      int_mv ref_id[2][2], ref_diff[2][2];
  690|   426k|      int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
  691|       |
  692|   812k|      for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
  ------------------
  |  Branch (692:25): [True: 729k, False: 82.4k]
  |  Branch (692:53): [True: 386k, False: 343k]
  ------------------
  693|   386k|        const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
  694|   386k|        process_compound_ref_mv_candidate(
  695|   386k|            candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
  696|   386k|        idx += mi_size_wide[candidate->bsize];
  697|   386k|      }
  698|       |
  699|   846k|      for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
  ------------------
  |  Branch (699:25): [True: 792k, False: 53.7k]
  |  Branch (699:53): [True: 420k, False: 372k]
  ------------------
  700|   420k|        const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
  701|   420k|        process_compound_ref_mv_candidate(
  702|   420k|            candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
  703|   420k|        idx += mi_size_high[candidate->bsize];
  704|   420k|      }
  705|       |
  706|       |      // Build up the compound mv predictor
  707|   426k|      int_mv comp_list[MAX_MV_REF_CANDIDATES][2];
  708|       |
  709|  1.27M|      for (int idx = 0; idx < 2; ++idx) {
  ------------------
  |  Branch (709:25): [True: 851k, False: 426k]
  ------------------
  710|   851k|        int comp_idx = 0;
  711|   851k|        for (int list_idx = 0;
  712|  1.73M|             list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
  ------------------
  |  |  508|   887k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (712:14): [True: 887k, False: 851k]
  |  Branch (712:46): [True: 887k, False: 18.4E]
  ------------------
  713|   887k|             ++list_idx, ++comp_idx)
  714|   887k|          comp_list[comp_idx][idx] = ref_id[idx][list_idx];
  715|   851k|        for (int list_idx = 0;
  716|  1.48M|             list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
  ------------------
  |  |  508|  1.02M|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (716:14): [True: 1.02M, False: 460k]
  |  Branch (716:48): [True: 631k, False: 391k]
  ------------------
  717|   851k|             ++list_idx, ++comp_idx)
  718|   631k|          comp_list[comp_idx][idx] = ref_diff[idx][list_idx];
  719|  1.03M|        for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx)
  ------------------
  |  |  508|  1.03M|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (719:16): [True: 183k, False: 851k]
  ------------------
  720|   183k|          comp_list[comp_idx][idx] = gm_mv_candidates[idx];
  721|   851k|      }
  722|       |
  723|   426k|      if (*refmv_count) {
  ------------------
  |  Branch (723:11): [True: 253k, False: 172k]
  ------------------
  724|   253k|        assert(*refmv_count == 1);
  725|   253k|        if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int &&
  ------------------
  |  Branch (725:13): [True: 211k, False: 41.6k]
  ------------------
  726|   253k|            comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) {
  ------------------
  |  Branch (726:13): [True: 197k, False: 14.1k]
  ------------------
  727|   197k|          ref_mv_stack[*refmv_count].this_mv = comp_list[1][0];
  728|   197k|          ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1];
  729|   197k|        } else {
  730|  55.8k|          ref_mv_stack[*refmv_count].this_mv = comp_list[0][0];
  731|  55.8k|          ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1];
  732|  55.8k|        }
  733|   253k|        ref_mv_weight[*refmv_count] = 2;
  734|   253k|        ++*refmv_count;
  735|   253k|      } else {
  736|   517k|        for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
  ------------------
  |  |  508|   517k|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (736:27): [True: 344k, False: 172k]
  ------------------
  737|   344k|          ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0];
  738|   344k|          ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1];
  739|   344k|          ref_mv_weight[*refmv_count] = 2;
  740|   344k|          ++*refmv_count;
  741|   344k|        }
  742|   172k|      }
  743|   426k|    }
  744|       |
  745|   634k|    assert(*refmv_count >= 2);
  746|       |
  747|  2.09M|    for (int idx = 0; idx < *refmv_count; ++idx) {
  ------------------
  |  Branch (747:23): [True: 1.46M, False: 634k]
  ------------------
  748|  1.46M|      clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
  ------------------
  |  |   39|  1.46M|#define MI_SIZE_LOG2 2
  ------------------
  749|  1.46M|                   xd->height << MI_SIZE_LOG2, xd);
  ------------------
  |  |   39|  1.46M|#define MI_SIZE_LOG2 2
  ------------------
  750|  1.46M|      clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2,
  ------------------
  |  |   39|  1.46M|#define MI_SIZE_LOG2 2
  ------------------
  751|  1.46M|                   xd->height << MI_SIZE_LOG2, xd);
  ------------------
  |  |   39|  1.46M|#define MI_SIZE_LOG2 2
  ------------------
  752|  1.46M|    }
  753|  3.98M|  } else {
  754|       |    // Handle single reference frame extension
  755|  5.48M|    for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
  ------------------
  |  Branch (755:23): [True: 5.36M, False: 123k]
  |  Branch (755:51): [True: 3.95M, False: 1.40M]
  ------------------
  756|  5.48M|                      *refmv_count < MAX_MV_REF_CANDIDATES;) {
  ------------------
  |  |  508|  3.95M|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (756:23): [True: 1.50M, False: 2.45M]
  ------------------
  757|  1.50M|      const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
  758|  1.50M|      process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
  759|  1.50M|                                      ref_mv_stack, ref_mv_weight);
  760|  1.50M|      idx += mi_size_wide[candidate->bsize];
  761|  1.50M|    }
  762|       |
  763|  5.36M|    for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
  ------------------
  |  Branch (763:23): [True: 5.27M, False: 84.4k]
  |  Branch (763:51): [True: 4.00M, False: 1.27M]
  ------------------
  764|  5.36M|                      *refmv_count < MAX_MV_REF_CANDIDATES;) {
  ------------------
  |  |  508|  4.00M|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (764:23): [True: 1.37M, False: 2.62M]
  ------------------
  765|  1.37M|      const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
  766|  1.37M|      process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
  767|  1.37M|                                      ref_mv_stack, ref_mv_weight);
  768|  1.37M|      idx += mi_size_high[candidate->bsize];
  769|  1.37M|    }
  770|       |
  771|  14.3M|    for (int idx = 0; idx < *refmv_count; ++idx) {
  ------------------
  |  Branch (771:23): [True: 10.3M, False: 3.98M]
  ------------------
  772|  10.3M|      clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
  ------------------
  |  |   39|  10.3M|#define MI_SIZE_LOG2 2
  ------------------
  773|  10.3M|                   xd->height << MI_SIZE_LOG2, xd);
  ------------------
  |  |   39|  10.3M|#define MI_SIZE_LOG2 2
  ------------------
  774|  10.3M|    }
  775|       |
  776|  3.98M|    if (mv_ref_list != NULL) {
  ------------------
  |  Branch (776:9): [True: 3.98M, False: 18.4E]
  ------------------
  777|  5.30M|      for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx)
  ------------------
  |  |  508|  5.30M|#define MAX_MV_REF_CANDIDATES 2
  ------------------
  |  Branch (777:36): [True: 1.31M, False: 3.98M]
  ------------------
  778|  1.31M|        mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int;
  779|       |
  780|  10.6M|      for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count);
  ------------------
  |  |   34|  10.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.01M, False: 5.61M]
  |  |  ------------------
  ------------------
  |  Branch (780:25): [True: 6.64M, False: 3.98M]
  ------------------
  781|  6.64M|           ++idx) {
  782|  6.64M|        mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
  783|  6.64M|      }
  784|  3.98M|    }
  785|  3.98M|  }
  786|  4.61M|}
mvref_common.c:scan_row_mbmi:
  149|  9.84M|                                 int *processed_rows) {
  150|  9.84M|  int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
  ------------------
  |  |   34|  9.84M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 9.60M, False: 239k]
  |  |  ------------------
  ------------------
  151|  9.84M|  end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
  ------------------
  |  |   34|  9.84M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 9.58M, False: 262k]
  |  |  ------------------
  ------------------
  152|  9.84M|  const int width_8x8 = mi_size_wide[BLOCK_8X8];
  153|  9.84M|  const int width_16x16 = mi_size_wide[BLOCK_16X16];
  154|  9.84M|  int col_offset = 0;
  155|       |  // TODO(jingning): Revisit this part after cb4x4 is stable.
  156|  9.84M|  if (abs(row_offset) > 1) {
  ------------------
  |  Branch (156:7): [True: 5.45M, False: 4.39M]
  ------------------
  157|  5.45M|    col_offset = 1;
  158|  5.45M|    if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset;
  ------------------
  |  Branch (158:9): [True: 680k, False: 4.77M]
  |  Branch (158:28): [True: 680k, False: 18.4E]
  ------------------
  159|  5.45M|  }
  160|  9.84M|  const int use_step_16 = (xd->width >= 16);
  161|  9.84M|  MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
  162|       |
  163|  21.3M|  for (int i = 0; i < end_mi;) {
  ------------------
  |  Branch (163:19): [True: 11.4M, False: 9.84M]
  ------------------
  164|  11.4M|    const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
  165|  11.4M|    const int candidate_bsize = candidate->bsize;
  166|  11.4M|    const int n4_w = mi_size_wide[candidate_bsize];
  167|  11.4M|    int len = AOMMIN(xd->width, n4_w);
  ------------------
  |  |   34|  11.4M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.63M, False: 7.83M]
  |  |  ------------------
  ------------------
  168|  11.4M|    if (use_step_16)
  ------------------
  |  Branch (168:9): [True: 437k, False: 11.0M]
  ------------------
  169|   437k|      len = AOMMAX(width_16x16, len);
  ------------------
  |  |   35|   437k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 60.2k, False: 376k]
  |  |  ------------------
  ------------------
  170|  11.0M|    else if (abs(row_offset) > 1)
  ------------------
  |  Branch (170:14): [True: 6.23M, False: 4.80M]
  ------------------
  171|  6.23M|      len = AOMMAX(len, width_8x8);
  ------------------
  |  |   35|  6.23M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.77M, False: 4.45M]
  |  |  ------------------
  ------------------
  172|       |
  173|  11.4M|    uint16_t weight = 2;
  174|  11.4M|    if (xd->width >= width_8x8 && xd->width <= n4_w) {
  ------------------
  |  Branch (174:9): [True: 9.29M, False: 2.17M]
  |  Branch (174:35): [True: 6.16M, False: 3.13M]
  ------------------
  175|  6.16M|      uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1,
  ------------------
  |  |   34|  6.16M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.64M, False: 4.51M]
  |  |  ------------------
  ------------------
  176|  6.16M|                            mi_size_high[candidate_bsize]);
  177|       |      // Obtain range used in weight calculation.
  178|  6.16M|      weight = AOMMAX(weight, inc);
  ------------------
  |  |   35|  6.16M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.12M, False: 5.03M]
  |  |  ------------------
  ------------------
  179|       |      // Update processed rows.
  180|  6.16M|      *processed_rows = inc - row_offset - 1;
  181|  6.16M|    }
  182|       |
  183|  11.4M|    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
  184|  11.4M|                         newmv_count, ref_mv_stack, ref_mv_weight,
  185|  11.4M|                         gm_mv_candidates, cm->global_motion, len * weight);
  186|       |
  187|  11.4M|    i += len;
  188|  11.4M|  }
  189|  9.84M|}
mvref_common.c:add_ref_mv_candidate:
   80|  29.7M|    uint16_t weight) {
   81|  29.7M|  if (!is_inter_block(candidate)) return;
  ------------------
  |  Branch (81:7): [True: 3.83M, False: 25.9M]
  ------------------
   82|  25.9M|  assert(weight % 2 == 0);
   83|  25.9M|  int index, ref;
   84|       |
   85|  25.9M|  if (rf[1] == NONE_FRAME) {
  ------------------
  |  Branch (85:7): [True: 22.7M, False: 3.18M]
  ------------------
   86|       |    // single reference frame
   87|  68.2M|    for (ref = 0; ref < 2; ++ref) {
  ------------------
  |  Branch (87:19): [True: 45.5M, False: 22.7M]
  ------------------
   88|  45.5M|      if (candidate->ref_frame[ref] == rf[0]) {
  ------------------
  |  Branch (88:11): [True: 19.5M, False: 25.9M]
  ------------------
   89|  19.5M|        const int is_gm_block =
   90|  19.5M|            is_global_mv_block(candidate, gm_params[rf[0]].wmtype);
   91|  19.5M|        const int_mv this_refmv =
   92|  19.5M|            is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref);
  ------------------
  |  Branch (92:13): [True: 758k, False: 18.8M]
  ------------------
   93|  36.8M|        for (index = 0; index < *refmv_count; ++index) {
  ------------------
  |  Branch (93:25): [True: 27.0M, False: 9.82M]
  ------------------
   94|  27.0M|          if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) {
  ------------------
  |  Branch (94:15): [True: 9.74M, False: 17.2M]
  ------------------
   95|  9.74M|            ref_mv_weight[index] += weight;
   96|  9.74M|            break;
   97|  9.74M|          }
   98|  27.0M|        }
   99|       |
  100|       |        // Add a new item to the list.
  101|  19.5M|        if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
  ------------------
  |  |  510|  9.82M|#define MAX_REF_MV_STACK_SIZE 8
  ------------------
  |  Branch (101:13): [True: 9.82M, False: 9.74M]
  |  Branch (101:38): [True: 9.75M, False: 71.8k]
  ------------------
  102|  9.75M|          ref_mv_stack[index].this_mv = this_refmv;
  103|  9.75M|          ref_mv_weight[index] = weight;
  104|  9.75M|          ++(*refmv_count);
  105|  9.75M|        }
  106|  19.5M|        if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
  ------------------
  |  Branch (106:13): [True: 9.68M, False: 9.88M]
  ------------------
  107|  19.5M|        ++*ref_match_count;
  108|  19.5M|      }
  109|  45.5M|    }
  110|  22.7M|  } else {
  111|       |    // compound reference frame
  112|  3.18M|    if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) {
  ------------------
  |  Branch (112:9): [True: 2.21M, False: 968k]
  |  Branch (112:45): [True: 1.28M, False: 930k]
  ------------------
  113|  1.28M|      int_mv this_refmv[2];
  114|       |
  115|  3.85M|      for (ref = 0; ref < 2; ++ref) {
  ------------------
  |  Branch (115:21): [True: 2.57M, False: 1.28M]
  ------------------
  116|  2.57M|        if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype))
  ------------------
  |  Branch (116:13): [True: 54.8k, False: 2.51M]
  ------------------
  117|  54.8k|          this_refmv[ref] = gm_mv_candidates[ref];
  118|  2.51M|        else
  119|  2.51M|          this_refmv[ref] = get_block_mv(candidate, ref);
  120|  2.57M|      }
  121|       |
  122|  1.96M|      for (index = 0; index < *refmv_count; ++index) {
  ------------------
  |  Branch (122:23): [True: 1.20M, False: 757k]
  ------------------
  123|  1.20M|        if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
  ------------------
  |  Branch (123:13): [True: 626k, False: 579k]
  ------------------
  124|  1.20M|            (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) {
  ------------------
  |  Branch (124:13): [True: 529k, False: 96.5k]
  ------------------
  125|   529k|          ref_mv_weight[index] += weight;
  126|   529k|          break;
  127|   529k|        }
  128|  1.20M|      }
  129|       |
  130|       |      // Add a new item to the list.
  131|  1.28M|      if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
  ------------------
  |  |  510|   757k|#define MAX_REF_MV_STACK_SIZE 8
  ------------------
  |  Branch (131:11): [True: 757k, False: 529k]
  |  Branch (131:36): [True: 756k, False: 954]
  ------------------
  132|   756k|        ref_mv_stack[index].this_mv = this_refmv[0];
  133|   756k|        ref_mv_stack[index].comp_mv = this_refmv[1];
  134|   756k|        ref_mv_weight[index] = weight;
  135|   756k|        ++(*refmv_count);
  136|   756k|      }
  137|  1.28M|      if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
  ------------------
  |  Branch (137:11): [True: 398k, False: 888k]
  ------------------
  138|  1.28M|      ++*ref_match_count;
  139|  1.28M|    }
  140|  3.18M|  }
  141|  25.9M|}
mvref_common.c:scan_col_mbmi:
  197|  9.91M|                                 int *processed_cols) {
  198|  9.91M|  int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
  ------------------
  |  |   34|  9.91M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 9.74M, False: 163k]
  |  |  ------------------
  ------------------
  199|  9.91M|  end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
  ------------------
  |  |   34|  9.91M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 9.62M, False: 287k]
  |  |  ------------------
  ------------------
  200|  9.91M|  const int n8_h_8 = mi_size_high[BLOCK_8X8];
  201|  9.91M|  const int n8_h_16 = mi_size_high[BLOCK_16X16];
  202|  9.91M|  int i;
  203|  9.91M|  int row_offset = 0;
  204|  9.91M|  if (abs(col_offset) > 1) {
  ------------------
  |  Branch (204:7): [True: 5.43M, False: 4.47M]
  ------------------
  205|  5.43M|    row_offset = 1;
  206|  5.43M|    if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset;
  ------------------
  |  Branch (206:9): [True: 835k, False: 4.60M]
  |  Branch (206:28): [True: 835k, False: 18.4E]
  ------------------
  207|  5.43M|  }
  208|  9.91M|  const int use_step_16 = (xd->height >= 16);
  209|       |
  210|  21.5M|  for (i = 0; i < end_mi;) {
  ------------------
  |  Branch (210:15): [True: 11.6M, False: 9.91M]
  ------------------
  211|  11.6M|    const MB_MODE_INFO *const candidate =
  212|  11.6M|        xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
  213|  11.6M|    const int candidate_bsize = candidate->bsize;
  214|  11.6M|    const int n4_h = mi_size_high[candidate_bsize];
  215|  11.6M|    int len = AOMMIN(xd->height, n4_h);
  ------------------
  |  |   34|  11.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.94M, False: 7.65M]
  |  |  ------------------
  ------------------
  216|  11.6M|    if (use_step_16)
  ------------------
  |  Branch (216:9): [True: 472k, False: 11.1M]
  ------------------
  217|   472k|      len = AOMMAX(n8_h_16, len);
  ------------------
  |  |   35|   472k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 64.8k, False: 407k]
  |  |  ------------------
  ------------------
  218|  11.1M|    else if (abs(col_offset) > 1)
  ------------------
  |  Branch (218:14): [True: 6.22M, False: 4.89M]
  ------------------
  219|  6.22M|      len = AOMMAX(len, n8_h_8);
  ------------------
  |  |   35|  6.22M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.27M, False: 4.94M]
  |  |  ------------------
  ------------------
  220|       |
  221|  11.6M|    int weight = 2;
  222|  11.6M|    if (xd->height >= n8_h_8 && xd->height <= n4_h) {
  ------------------
  |  Branch (222:9): [True: 8.95M, False: 2.65M]
  |  Branch (222:33): [True: 5.67M, False: 3.27M]
  ------------------
  223|  5.67M|      int inc = AOMMIN(-max_col_offset + col_offset + 1,
  ------------------
  |  |   34|  5.67M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.85M, False: 3.81M]
  |  |  ------------------
  ------------------
  224|  5.67M|                       mi_size_wide[candidate_bsize]);
  225|       |      // Obtain range used in weight calculation.
  226|  5.67M|      weight = AOMMAX(weight, inc);
  ------------------
  |  |   35|  5.67M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 865k, False: 4.80M]
  |  |  ------------------
  ------------------
  227|       |      // Update processed cols.
  228|  5.67M|      *processed_cols = inc - col_offset - 1;
  229|  5.67M|    }
  230|       |
  231|  11.6M|    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
  232|  11.6M|                         newmv_count, ref_mv_stack, ref_mv_weight,
  233|  11.6M|                         gm_mv_candidates, cm->global_motion, len * weight);
  234|       |
  235|  11.6M|    i += len;
  236|  11.6M|  }
  237|  9.91M|}
mvref_common.c:scan_blk_mbmi:
  246|  7.19M|                                 uint8_t *refmv_count) {
  247|  7.19M|  const TileInfo *const tile = &xd->tile;
  248|  7.19M|  POSITION mi_pos;
  249|       |
  250|  7.19M|  mi_pos.row = row_offset;
  251|  7.19M|  mi_pos.col = col_offset;
  252|       |
  253|  7.19M|  if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
  ------------------
  |  Branch (253:7): [True: 6.71M, False: 485k]
  ------------------
  254|  6.71M|    const MB_MODE_INFO *const candidate =
  255|  6.71M|        xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
  256|  6.71M|    const int len = mi_size_wide[BLOCK_8X8];
  257|       |
  258|  6.71M|    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
  259|  6.71M|                         newmv_count, ref_mv_stack, ref_mv_weight,
  260|  6.71M|                         gm_mv_candidates, cm->global_motion, 2 * len);
  261|  6.71M|  }  // Analyze a single 8x8 block motion information.
  262|  7.19M|}
mvref_common.c:add_tpl_ref_mv:
  335|  17.7M|                          int16_t *mode_context) {
  336|  17.7M|  POSITION mi_pos;
  337|  17.7M|  mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
  ------------------
  |  Branch (337:16): [True: 551k, False: 17.1M]
  ------------------
  338|  17.7M|  mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
  ------------------
  |  Branch (338:16): [True: 459k, False: 17.2M]
  ------------------
  339|       |
  340|  17.7M|  if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
  ------------------
  |  Branch (340:7): [True: 115k, False: 17.6M]
  ------------------
  341|       |
  342|  17.6M|  const TPL_MV_REF *prev_frame_mvs =
  343|  17.6M|      cm->tpl_mvs +
  344|  17.6M|      ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) +
  345|  17.6M|      ((mi_col + mi_pos.col) >> 1);
  346|  17.6M|  if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0;
  ------------------
  |  |   26|  17.6M|#define INVALID_MV 0x80008000
  ------------------
  |  Branch (346:7): [True: 15.9M, False: 1.67M]
  ------------------
  347|       |
  348|  1.67M|  MV_REFERENCE_FRAME rf[2];
  349|  1.67M|  av1_set_ref_frame(rf, ref_frame);
  350|       |
  351|  1.67M|  const uint16_t weight_unit = 1;  // mi_size_wide[BLOCK_8X8];
  352|  1.67M|  const int cur_frame_index = cm->cur_frame->order_hint;
  353|  1.67M|  const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
  354|  1.67M|  const int frame0_index = buf_0->order_hint;
  355|  1.67M|  const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info,
  356|  1.67M|                                             cur_frame_index, frame0_index);
  357|  1.67M|  int idx;
  358|  1.67M|  const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
  359|  1.67M|  const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
  360|       |
  361|  1.67M|  int_mv this_refmv;
  362|  1.67M|  av1_get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
  363|  1.67M|                        cur_offset_0, prev_frame_mvs->ref_frame_offset);
  364|  1.67M|  lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv,
  365|  1.67M|                     force_integer_mv);
  366|       |
  367|  1.67M|  if (rf[1] == NONE_FRAME) {
  ------------------
  |  Branch (367:7): [True: 915k, False: 759k]
  ------------------
  368|   915k|    if (blk_row == 0 && blk_col == 0) {
  ------------------
  |  Branch (368:9): [True: 367k, False: 548k]
  |  Branch (368:25): [True: 142k, False: 224k]
  ------------------
  369|   142k|      if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
  ------------------
  |  Branch (369:11): [True: 18.5k, False: 123k]
  ------------------
  370|   142k|          abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
  ------------------
  |  Branch (370:11): [True: 51.5k, False: 72.1k]
  ------------------
  371|  70.2k|        mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
  ------------------
  |  |  487|  70.2k|#define GLOBALMV_OFFSET 3
  ------------------
  372|   142k|    }
  373|       |
  374|  2.09M|    for (idx = 0; idx < *refmv_count; ++idx)
  ------------------
  |  Branch (374:19): [True: 1.90M, False: 188k]
  ------------------
  375|  1.90M|      if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
  ------------------
  |  Branch (375:11): [True: 727k, False: 1.17M]
  ------------------
  376|       |
  377|   915k|    if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
  ------------------
  |  Branch (377:9): [True: 727k, False: 188k]
  ------------------
  378|       |
  379|   915k|    if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
  ------------------
  |  |  510|   192k|#define MAX_REF_MV_STACK_SIZE 8
  ------------------
  |  Branch (379:9): [True: 192k, False: 723k]
  |  Branch (379:32): [True: 191k, False: 1.11k]
  ------------------
  380|   191k|      ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
  381|   191k|      ref_mv_weight[idx] = 2 * weight_unit;
  382|   191k|      ++(*refmv_count);
  383|   191k|    }
  384|   915k|  } else {
  385|       |    // Process compound inter mode
  386|   759k|    const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]);
  387|   759k|    const int frame1_index = buf_1->order_hint;
  388|   759k|    const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info,
  389|   759k|                                               cur_frame_index, frame1_index);
  390|   759k|    int_mv comp_refmv;
  391|   759k|    av1_get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
  392|   759k|                          cur_offset_1, prev_frame_mvs->ref_frame_offset);
  393|   759k|    lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv,
  394|   759k|                       force_integer_mv);
  395|       |
  396|   759k|    if (blk_row == 0 && blk_col == 0) {
  ------------------
  |  Branch (396:9): [True: 238k, False: 521k]
  |  Branch (396:25): [True: 65.6k, False: 172k]
  ------------------
  397|  65.6k|      if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
  ------------------
  |  Branch (397:11): [True: 3.90k, False: 61.7k]
  ------------------
  398|  65.6k|          abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
  ------------------
  |  Branch (398:11): [True: 28.6k, False: 33.1k]
  ------------------
  399|  65.6k|          abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
  ------------------
  |  Branch (399:11): [True: 1.10k, False: 32.0k]
  ------------------
  400|  65.6k|          abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
  ------------------
  |  Branch (400:11): [True: 2.53k, False: 29.4k]
  ------------------
  401|  36.2k|        mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
  ------------------
  |  |  487|  36.2k|#define GLOBALMV_OFFSET 3
  ------------------
  402|  65.6k|    }
  403|       |
  404|  1.44M|    for (idx = 0; idx < *refmv_count; ++idx) {
  ------------------
  |  Branch (404:19): [True: 1.35M, False: 94.0k]
  ------------------
  405|  1.35M|      if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
  ------------------
  |  Branch (405:11): [True: 677k, False: 675k]
  ------------------
  406|  1.35M|          comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
  ------------------
  |  Branch (406:11): [True: 665k, False: 12.0k]
  ------------------
  407|   665k|        break;
  408|  1.35M|    }
  409|       |
  410|   759k|    if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
  ------------------
  |  Branch (410:9): [True: 666k, False: 93.8k]
  ------------------
  411|       |
  412|   759k|    if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
  ------------------
  |  |  510|   108k|#define MAX_REF_MV_STACK_SIZE 8
  ------------------
  |  Branch (412:9): [True: 108k, False: 651k]
  |  Branch (412:32): [True: 105k, False: 2.36k]
  ------------------
  413|   105k|      ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
  414|   105k|      ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
  415|   105k|      ref_mv_weight[idx] = 2 * weight_unit;
  416|   105k|      ++(*refmv_count);
  417|   105k|    }
  418|   759k|  }
  419|       |
  420|  1.67M|  return 1;
  421|  17.6M|}
mvref_common.c:check_sb_border:
  317|  6.63M|                           const int row_offset, const int col_offset) {
  318|  6.63M|  const int sb_mi_size = mi_size_wide[BLOCK_64X64];
  319|  6.63M|  const int row = mi_row & (sb_mi_size - 1);
  320|  6.63M|  const int col = mi_col & (sb_mi_size - 1);
  321|       |
  322|  6.63M|  if (row + row_offset < 0 || row + row_offset >= sb_mi_size ||
  ------------------
  |  Branch (322:7): [True: 18.4E, False: 6.63M]
  |  Branch (322:31): [True: 977k, False: 5.65M]
  ------------------
  323|  6.63M|      col + col_offset < 0 || col + col_offset >= sb_mi_size)
  ------------------
  |  Branch (323:7): [True: 412k, False: 5.24M]
  |  Branch (323:31): [True: 966k, False: 4.27M]
  ------------------
  324|  2.35M|    return 0;
  325|       |
  326|  4.27M|  return 1;
  327|  6.63M|}
mvref_common.c:process_compound_ref_mv_candidate:
  426|   806k|    int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
  427|  2.41M|  for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
  ------------------
  |  Branch (427:24): [True: 1.61M, False: 806k]
  ------------------
  428|  1.61M|    MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
  429|       |
  430|  4.83M|    for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
  ------------------
  |  Branch (430:27): [True: 3.22M, False: 1.61M]
  ------------------
  431|  3.22M|      if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
  ------------------
  |  Branch (431:11): [True: 919k, False: 2.30M]
  |  Branch (431:36): [True: 887k, False: 31.9k]
  ------------------
  432|   887k|        ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
  433|   887k|        ++ref_id_count[cmp_idx];
  434|  2.33M|      } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
  ------------------
  |  Branch (434:18): [True: 1.50M, False: 829k]
  |  Branch (434:42): [True: 1.21M, False: 294k]
  ------------------
  435|  1.21M|        int_mv this_mv = candidate->mv[rf_idx];
  436|  1.21M|        if (cm->ref_frame_sign_bias[can_rf] !=
  ------------------
  |  Branch (436:13): [True: 259k, False: 953k]
  ------------------
  437|  1.21M|            cm->ref_frame_sign_bias[rf[cmp_idx]]) {
  438|   259k|          this_mv.as_mv.row = -this_mv.as_mv.row;
  439|   259k|          this_mv.as_mv.col = -this_mv.as_mv.col;
  440|   259k|        }
  441|  1.21M|        ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
  442|  1.21M|        ++ref_diff_count[cmp_idx];
  443|  1.21M|      }
  444|  3.22M|    }
  445|  1.61M|  }
  446|   806k|}
mvref_common.c:process_single_ref_mv_candidate:
  452|  2.88M|    uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) {
  453|  8.65M|  for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
  ------------------
  |  Branch (453:24): [True: 5.76M, False: 2.88M]
  ------------------
  454|  5.76M|    if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
  ------------------
  |  Branch (454:9): [True: 2.39M, False: 3.36M]
  ------------------
  455|  2.39M|      int_mv this_mv = candidate->mv[rf_idx];
  456|  2.39M|      if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
  ------------------
  |  Branch (456:11): [True: 59.1k, False: 2.33M]
  ------------------
  457|  2.39M|          cm->ref_frame_sign_bias[ref_frame]) {
  458|  59.1k|        this_mv.as_mv.row = -this_mv.as_mv.row;
  459|  59.1k|        this_mv.as_mv.col = -this_mv.as_mv.col;
  460|  59.1k|      }
  461|  2.39M|      int stack_idx;
  462|  2.73M|      for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) {
  ------------------
  |  Branch (462:27): [True: 2.29M, False: 443k]
  ------------------
  463|  2.29M|        const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv;
  464|  2.29M|        if (this_mv.as_int == stack_mv.as_int) break;
  ------------------
  |  Branch (464:13): [True: 1.95M, False: 337k]
  ------------------
  465|  2.29M|      }
  466|       |
  467|  2.39M|      if (stack_idx == *refmv_count) {
  ------------------
  |  Branch (467:11): [True: 443k, False: 1.95M]
  ------------------
  468|   443k|        ref_mv_stack[stack_idx].this_mv = this_mv;
  469|       |
  470|       |        // TODO(jingning): Set an arbitrary small number here. The weight
  471|       |        // doesn't matter as long as it is properly initialized.
  472|   443k|        ref_mv_weight[stack_idx] = 2;
  473|   443k|        ++(*refmv_count);
  474|   443k|      }
  475|  2.39M|    }
  476|  5.76M|  }
  477|  2.88M|}
mvref_common.c:motion_field_projection:
  919|  45.0k|                                   MV_REFERENCE_FRAME start_frame, int dir) {
  920|  45.0k|  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
  921|  45.0k|  int ref_offset[REF_FRAMES] = { 0 };
  922|       |
  923|  45.0k|  const RefCntBuffer *const start_frame_buf =
  924|  45.0k|      get_ref_frame_buf(cm, start_frame);
  925|  45.0k|  if (start_frame_buf == NULL) return 0;
  ------------------
  |  Branch (925:7): [True: 0, False: 45.0k]
  ------------------
  926|       |
  927|  45.0k|  if (start_frame_buf->frame_type == KEY_FRAME ||
  ------------------
  |  Branch (927:7): [True: 28.7k, False: 16.2k]
  ------------------
  928|  45.0k|      start_frame_buf->frame_type == INTRA_ONLY_FRAME)
  ------------------
  |  Branch (928:7): [True: 295, False: 15.9k]
  ------------------
  929|  29.0k|    return 0;
  930|       |
  931|  15.9k|  if (start_frame_buf->mi_rows != cm->mi_params.mi_rows ||
  ------------------
  |  Branch (931:7): [True: 15, False: 15.9k]
  ------------------
  932|  15.9k|      start_frame_buf->mi_cols != cm->mi_params.mi_cols)
  ------------------
  |  Branch (932:7): [True: 10, False: 15.9k]
  ------------------
  933|     25|    return 0;
  934|       |
  935|  15.9k|  const int start_frame_order_hint = start_frame_buf->order_hint;
  936|  15.9k|  const unsigned int *const ref_order_hints =
  937|  15.9k|      &start_frame_buf->ref_order_hints[0];
  938|  15.9k|  const int cur_order_hint = cm->cur_frame->order_hint;
  939|  15.9k|  int start_to_current_frame_offset = get_relative_dist(
  940|  15.9k|      &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint);
  941|       |
  942|   127k|  for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
  ------------------
  |  Branch (942:44): [True: 111k, False: 15.9k]
  ------------------
  943|   111k|    ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info,
  944|   111k|                                       start_frame_order_hint,
  945|   111k|                                       ref_order_hints[rf - LAST_FRAME]);
  946|   111k|  }
  947|       |
  948|  15.9k|  if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
  ------------------
  |  Branch (948:7): [True: 10.9k, False: 4.98k]
  ------------------
  949|       |
  950|  15.9k|  MV_REF *mv_ref_base = start_frame_buf->mvs;
  951|  15.9k|  const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1;
  952|  15.9k|  const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1;
  953|       |
  954|   484k|  for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
  ------------------
  |  Branch (954:25): [True: 468k, False: 15.9k]
  ------------------
  955|  26.9M|    for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
  ------------------
  |  Branch (955:27): [True: 26.4M, False: 468k]
  ------------------
  956|  26.4M|      MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col];
  957|  26.4M|      MV fwd_mv = mv_ref->mv.as_mv;
  958|       |
  959|  26.4M|      if (mv_ref->ref_frame > INTRA_FRAME) {
  ------------------
  |  Branch (959:11): [True: 19.4M, False: 7.00M]
  ------------------
  960|  19.4M|        int_mv this_mv;
  961|  19.4M|        int mi_r, mi_c;
  962|  19.4M|        const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
  963|       |
  964|  19.4M|        int pos_valid =
  965|  19.4M|            abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
  ------------------
  |  |   68|  38.9M|#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
  |  |  ------------------
  |  |  |  |   67|  19.4M|#define FRAME_OFFSET_BITS 5
  |  |  ------------------
  ------------------
  |  Branch (965:13): [True: 16.0M, False: 3.41M]
  ------------------
  966|  19.4M|            ref_frame_offset > 0 &&
  ------------------
  |  Branch (966:13): [True: 16.0M, False: 0]
  ------------------
  967|  19.4M|            abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
  ------------------
  |  |   68|  16.0M|#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
  |  |  ------------------
  |  |  |  |   67|  16.0M|#define FRAME_OFFSET_BITS 5
  |  |  ------------------
  ------------------
  |  Branch (967:13): [True: 16.0M, False: 18.5k]
  ------------------
  968|       |
  969|  19.4M|        if (pos_valid) {
  ------------------
  |  Branch (969:13): [True: 16.0M, False: 3.43M]
  ------------------
  970|  16.0M|          av1_get_mv_projection(&this_mv.as_mv, fwd_mv,
  971|  16.0M|                                start_to_current_frame_offset,
  972|  16.0M|                                ref_frame_offset);
  973|  16.0M|          pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
  974|  16.0M|                                         this_mv.as_mv, dir >> 1);
  975|  16.0M|        }
  976|       |
  977|  19.4M|        if (pos_valid) {
  ------------------
  |  Branch (977:13): [True: 13.5M, False: 5.89M]
  ------------------
  978|  13.5M|          const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c;
  979|       |
  980|  13.5M|          tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
  981|  13.5M|          tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
  982|  13.5M|          tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset;
  983|  13.5M|        }
  984|  19.4M|      }
  985|  26.4M|    }
  986|   468k|  }
  987|       |
  988|  15.9k|  return 1;
  989|  15.9k|}
mvref_common.c:get_block_position:
  881|  16.0M|                              int blk_col, MV mv, int sign_bias) {
  882|  16.0M|  const int base_blk_row = (blk_row >> 3) << 3;
  883|  16.0M|  const int base_blk_col = (blk_col >> 3) << 3;
  884|       |
  885|  16.0M|  const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2))
  ------------------
  |  |   39|  11.3M|#define MI_SIZE_LOG2 2
  ------------------
  |  Branch (885:26): [True: 11.3M, False: 4.70M]
  ------------------
  886|  16.0M|                                       : -((-mv.row) >> (4 + MI_SIZE_LOG2));
  ------------------
  |  |   39|  4.70M|#define MI_SIZE_LOG2 2
  ------------------
  887|       |
  888|  16.0M|  const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
  ------------------
  |  |   39|  14.0M|#define MI_SIZE_LOG2 2
  ------------------
  |  Branch (888:26): [True: 14.0M, False: 2.01M]
  ------------------
  889|  16.0M|                                       : -((-mv.col) >> (4 + MI_SIZE_LOG2));
  ------------------
  |  |   39|  2.01M|#define MI_SIZE_LOG2 2
  ------------------
  890|       |
  891|  16.0M|  const int row =
  892|  16.0M|      (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
  ------------------
  |  Branch (892:7): [True: 8.56M, False: 7.45M]
  ------------------
  893|  16.0M|  const int col =
  894|  16.0M|      (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
  ------------------
  |  Branch (894:7): [True: 8.56M, False: 7.45M]
  ------------------
  895|       |
  896|  16.0M|  if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 ||
  ------------------
  |  Branch (896:7): [True: 759k, False: 15.2M]
  |  Branch (896:18): [True: 472k, False: 14.7M]
  |  Branch (896:57): [True: 316k, False: 14.4M]
  ------------------
  897|  16.0M|      col >= (cm->mi_params.mi_cols >> 1))
  ------------------
  |  Branch (897:7): [True: 42.1k, False: 14.4M]
  ------------------
  898|  1.59M|    return 0;
  899|       |
  900|  14.4M|  if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
  ------------------
  |  |  878|  14.4M|#define MAX_OFFSET_HEIGHT 0
  ------------------
  |  Branch (900:7): [True: 398k, False: 14.0M]
  ------------------
  901|  14.4M|      row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) ||
  ------------------
  |  |  878|  14.0M|#define MAX_OFFSET_HEIGHT 0
  ------------------
  |  Branch (901:7): [True: 354k, False: 13.6M]
  ------------------
  902|  14.4M|      col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) ||
  ------------------
  |  |  877|  13.6M|#define MAX_OFFSET_WIDTH 64
  ------------------
  |  Branch (902:7): [True: 107k, False: 13.5M]
  ------------------
  903|  14.4M|      col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3))
  ------------------
  |  |  877|  13.5M|#define MAX_OFFSET_WIDTH 64
  ------------------
  |  Branch (903:7): [True: 3.52k, False: 13.5M]
  ------------------
  904|   864k|    return 0;
  905|       |
  906|  13.5M|  *mi_r = row;
  907|  13.5M|  *mi_c = col;
  908|       |
  909|  13.5M|  return 1;
  910|  14.4M|}
mvref_common.c:record_samples:
 1078|  5.28M|                                  int col_offset, int sign_c) {
 1079|  5.28M|  const int bw = block_size_wide[mbmi->bsize];
 1080|  5.28M|  const int bh = block_size_high[mbmi->bsize];
 1081|  5.28M|  const int x = col_offset * MI_SIZE + sign_c * bw / 2 - 1;
  ------------------
  |  |   40|  5.28M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  5.28M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1082|  5.28M|  const int y = row_offset * MI_SIZE + sign_r * bh / 2 - 1;
  ------------------
  |  |   40|  5.28M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  5.28M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1083|       |
 1084|  5.28M|  pts[0] = GET_MV_SUBPEL(x);
  ------------------
  |  |   29|  5.28M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
 1085|  5.28M|  pts[1] = GET_MV_SUBPEL(y);
  ------------------
  |  |   29|  5.28M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
 1086|  5.28M|  pts_inref[0] = pts[0] + mbmi->mv[0].as_mv.col;
 1087|  5.28M|  pts_inref[1] = pts[1] + mbmi->mv[0].as_mv.row;
 1088|  5.28M|}
mvref_common.c:has_top_right:
  265|  6.65M|                         int mi_row, int mi_col, int bs) {
  266|  6.65M|  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
  267|  6.65M|  const int mask_row = mi_row & (sb_mi_size - 1);
  268|  6.65M|  const int mask_col = mi_col & (sb_mi_size - 1);
  269|       |
  270|  6.65M|  if (bs > mi_size_wide[BLOCK_64X64]) return 0;
  ------------------
  |  Branch (270:7): [True: 237k, False: 6.41M]
  ------------------
  271|       |
  272|       |  // In a split partition all apart from the bottom right has a top right
  273|  6.41M|  int has_tr = !((mask_row & bs) && (mask_col & bs));
  ------------------
  |  Branch (273:18): [True: 3.16M, False: 3.25M]
  |  Branch (273:37): [True: 1.58M, False: 1.57M]
  ------------------
  274|       |
  275|       |  // bs > 0 and bs is a power of 2
  276|  6.41M|  assert(bs > 0 && !(bs & (bs - 1)));
  277|       |
  278|       |  // For each 4x4 group of blocks, when the bottom right is decoded the blocks
  279|       |  // to the right have not been decoded therefore the bottom right does
  280|       |  // not have a top right
  281|  9.61M|  while (bs < sb_mi_size) {
  ------------------
  |  Branch (281:10): [True: 9.08M, False: 534k]
  ------------------
  282|  9.08M|    if (mask_col & bs) {
  ------------------
  |  Branch (282:9): [True: 4.15M, False: 4.92M]
  ------------------
  283|  4.15M|      if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) {
  ------------------
  |  Branch (283:11): [True: 1.88M, False: 2.27M]
  |  Branch (283:36): [True: 954k, False: 927k]
  ------------------
  284|   954k|        has_tr = 0;
  285|   954k|        break;
  286|   954k|      }
  287|  4.92M|    } else {
  288|  4.92M|      break;
  289|  4.92M|    }
  290|  3.20M|    bs <<= 1;
  291|  3.20M|  }
  292|       |
  293|       |  // In a VERTICAL or VERTICAL_4 partition, all partition before the last one
  294|       |  // always have a top right (as the block above will have been decoded).
  295|  6.41M|  if (xd->width < xd->height) {
  ------------------
  |  Branch (295:7): [True: 1.42M, False: 4.98M]
  ------------------
  296|  1.42M|    if (!xd->is_last_vertical_rect) has_tr = 1;
  ------------------
  |  Branch (296:9): [True: 818k, False: 610k]
  ------------------
  297|  1.42M|  }
  298|       |
  299|       |  // In a HORIZONTAL or HORIZONTAL_4 partition, partitions after the first one
  300|       |  // never have a top right (as the block to the right won't have been decoded).
  301|  6.41M|  if (xd->width > xd->height) {
  ------------------
  |  Branch (301:7): [True: 2.22M, False: 4.18M]
  ------------------
  302|  2.22M|    if (!xd->is_first_horizontal_rect) has_tr = 0;
  ------------------
  |  Branch (302:9): [True: 1.31M, False: 913k]
  ------------------
  303|  2.22M|  }
  304|       |
  305|       |  // The bottom left square of a Vertical A (in the old format) does
  306|       |  // not have a top right as it is decoded before the right hand
  307|       |  // rectangle of the partition
  308|  6.41M|  if (xd->mi[0]->partition == PARTITION_VERT_A) {
  ------------------
  |  Branch (308:7): [True: 233k, False: 6.18M]
  ------------------
  309|   233k|    if (xd->width == xd->height)
  ------------------
  |  Branch (309:9): [True: 152k, False: 81.4k]
  ------------------
  310|   152k|      if (mask_row & bs) has_tr = 0;
  ------------------
  |  Branch (310:11): [True: 83.6k, False: 68.6k]
  ------------------
  311|   233k|  }
  312|       |
  313|  6.41M|  return has_tr;
  314|  6.41M|}
mvref_common.c:compare_ref_frame_info:
 1329|   383k|static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
 1330|   383k|  const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a;
 1331|   383k|  const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b;
 1332|       |
 1333|   383k|  const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx;
 1334|   383k|  if (sort_idx_diff != 0) return sort_idx_diff;
  ------------------
  |  Branch (1334:7): [True: 258k, False: 125k]
  ------------------
 1335|   125k|  return info_a->map_idx - info_b->map_idx;
 1336|   383k|}
mvref_common.c:set_ref_frame_info:
 1339|   175k|                                      REF_FRAME_INFO *ref_info) {
 1340|   175k|  assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
 1341|       |
 1342|   175k|  remapped_ref_idx[frame_idx] = ref_info->map_idx;
 1343|   175k|}

decodemv.c:av1_find_ref_dv:
  268|  27.8k|                                   int mib_size, int mi_row) {
  269|  27.8k|  if (mi_row - mib_size < tile->mi_row_start) {
  ------------------
  |  Branch (269:7): [True: 23.6k, False: 4.17k]
  ------------------
  270|  23.6k|    ref_dv->as_fullmv.row = 0;
  271|  23.6k|    ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
  ------------------
  |  |   40|  23.6k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  23.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                  ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
  ------------------
  |  |  264|  23.6k|#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
  ------------------
  272|  23.6k|  } else {
  273|  4.17k|    ref_dv->as_fullmv.row = -MI_SIZE * mib_size;
  ------------------
  |  |   40|  4.17k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  4.17k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  274|  4.17k|    ref_dv->as_fullmv.col = 0;
  275|  4.17k|  }
  276|  27.8k|  convert_fullmv_to_mv(ref_dv);
  277|  27.8k|}
decodemv.c:av1_is_dv_valid:
  281|  60.0k|                                  BLOCK_SIZE bsize, int mib_size_log2) {
  282|  60.0k|  const int bw = block_size_wide[bsize];
  283|  60.0k|  const int bh = block_size_high[bsize];
  284|  60.0k|  const int SCALE_PX_TO_MV = 8;
  285|       |  // Disallow subpixel for now
  286|       |  // SUBPEL_MASK is not the correct scale
  287|  60.0k|  if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
  ------------------
  |  Branch (287:8): [True: 0, False: 60.0k]
  |  Branch (287:43): [True: 1, False: 60.0k]
  ------------------
  288|      0|    return 0;
  289|       |
  290|  60.0k|  const TileInfo *const tile = &xd->tile;
  291|       |  // Is the source top-left inside the current tile?
  292|  60.0k|  const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row;
  ------------------
  |  |   40|  60.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  60.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  293|  60.0k|  const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV;
  ------------------
  |  |   40|  60.0k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  60.0k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  294|  60.0k|  if (src_top_edge < tile_top_edge) return 0;
  ------------------
  |  Branch (294:7): [True: 1.18k, False: 58.8k]
  ------------------
  295|  58.8k|  const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col;
  ------------------
  |  |   40|  58.8k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  58.8k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  296|  58.8k|  const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV;
  ------------------
  |  |   40|  58.8k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  58.8k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  297|  58.8k|  if (src_left_edge < tile_left_edge) return 0;
  ------------------
  |  Branch (297:7): [True: 5.09k, False: 53.8k]
  ------------------
  298|       |  // Is the bottom right inside the current tile?
  299|  53.8k|  const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row;
  ------------------
  |  |   40|  53.8k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  53.8k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  300|  53.8k|  const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV;
  ------------------
  |  |   40|  53.8k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  53.8k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  301|  53.8k|  if (src_bottom_edge > tile_bottom_edge) return 0;
  ------------------
  |  Branch (301:7): [True: 54, False: 53.7k]
  ------------------
  302|  53.7k|  const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col;
  ------------------
  |  |   40|  53.7k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  53.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  303|  53.7k|  const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV;
  ------------------
  |  |   40|  53.7k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  53.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  304|  53.7k|  if (src_right_edge > tile_right_edge) return 0;
  ------------------
  |  Branch (304:7): [True: 91, False: 53.6k]
  ------------------
  305|       |
  306|       |  // Special case for sub 8x8 chroma cases, to prevent referring to chroma
  307|       |  // pixels outside current tile.
  308|  53.6k|  if (xd->is_chroma_ref && av1_num_planes(cm) > 1) {
  ------------------
  |  Branch (308:7): [True: 50.3k, False: 3.29k]
  |  Branch (308:28): [True: 48.6k, False: 1.67k]
  ------------------
  309|  48.6k|    const struct macroblockd_plane *const pd = &xd->plane[1];
  310|  48.6k|    if (bw < 8 && pd->subsampling_x)
  ------------------
  |  Branch (310:9): [True: 10.6k, False: 38.0k]
  |  Branch (310:19): [True: 716, False: 9.89k]
  ------------------
  311|    716|      if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
  ------------------
  |  Branch (311:11): [True: 1, False: 715]
  ------------------
  312|  48.6k|    if (bh < 8 && pd->subsampling_y)
  ------------------
  |  Branch (312:9): [True: 11.9k, False: 36.7k]
  |  Branch (312:19): [True: 1.39k, False: 10.5k]
  ------------------
  313|  1.39k|      if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
  ------------------
  |  Branch (313:11): [True: 1, False: 1.39k]
  ------------------
  314|  48.6k|  }
  315|       |
  316|       |  // Is the bottom right within an already coded SB? Also consider additional
  317|       |  // constraints to facilitate HW decoder.
  318|  53.6k|  const int max_mib_size = 1 << mib_size_log2;
  319|  53.6k|  const int active_sb_row = mi_row >> mib_size_log2;
  320|  53.6k|  const int active_sb64_col = (mi_col * MI_SIZE) >> 6;
  ------------------
  |  |   40|  53.6k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  53.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  321|  53.6k|  const int sb_size = max_mib_size * MI_SIZE;
  ------------------
  |  |   40|  53.6k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  53.6k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  322|  53.6k|  const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size;
  323|  53.6k|  const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6;
  324|  53.6k|  const int total_sb64_per_row =
  325|  53.6k|      ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1;
  326|  53.6k|  const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col;
  327|  53.6k|  const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
  328|  53.6k|  if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
  ------------------
  |  |  265|  53.6k|#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
  |  |  ------------------
  |  |  |  |  264|  53.6k|#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
  |  |  ------------------
  ------------------
  |  Branch (328:7): [True: 232, False: 53.4k]
  ------------------
  329|       |
  330|       |  // Wavefront constraint: use only top left area of frame for reference.
  331|  53.4k|  const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
  ------------------
  |  |  265|  53.4k|#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
  |  |  ------------------
  |  |  |  |  264|  53.4k|#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
  |  |  ------------------
  ------------------
  332|  53.4k|  const int wf_offset = gradient * (active_sb_row - src_sb_row);
  333|  53.4k|  if (src_sb_row > active_sb_row ||
  ------------------
  |  Branch (333:7): [True: 8, False: 53.4k]
  ------------------
  334|  53.4k|      src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
  ------------------
  |  |  265|  53.4k|#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
  |  |  ------------------
  |  |  |  |  264|  53.4k|#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
  |  |  ------------------
  ------------------
  |  Branch (334:7): [True: 10, False: 53.4k]
  ------------------
  335|     10|    return 0;
  336|       |
  337|  53.4k|  return 1;
  338|  53.4k|}
decodemv.c:av1_collect_neighbors_ref_counts:
  209|  4.55M|static inline void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
  210|  4.55M|  av1_zero(xd->neighbors_ref_counts);
  ------------------
  |  |   43|  4.55M|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  211|       |
  212|  4.55M|  uint8_t *const ref_counts = xd->neighbors_ref_counts;
  213|       |
  214|  4.55M|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  215|  4.55M|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  216|  4.55M|  const int above_in_image = xd->up_available;
  217|  4.55M|  const int left_in_image = xd->left_available;
  218|       |
  219|       |  // Above neighbor
  220|  4.55M|  if (above_in_image && is_inter_block(above_mbmi)) {
  ------------------
  |  Branch (220:7): [True: 4.33M, False: 215k]
  |  Branch (220:25): [True: 3.85M, False: 487k]
  ------------------
  221|  3.85M|    ref_counts[above_mbmi->ref_frame[0]]++;
  222|  3.85M|    if (has_second_ref(above_mbmi)) {
  ------------------
  |  Branch (222:9): [True: 554k, False: 3.29M]
  ------------------
  223|   554k|      ref_counts[above_mbmi->ref_frame[1]]++;
  224|   554k|    }
  225|  3.85M|  }
  226|       |
  227|       |  // Left neighbor
  228|  4.55M|  if (left_in_image && is_inter_block(left_mbmi)) {
  ------------------
  |  Branch (228:7): [True: 4.41M, False: 138k]
  |  Branch (228:24): [True: 3.91M, False: 499k]
  ------------------
  229|  3.91M|    ref_counts[left_mbmi->ref_frame[0]]++;
  230|  3.91M|    if (has_second_ref(left_mbmi)) {
  ------------------
  |  Branch (230:9): [True: 605k, False: 3.30M]
  ------------------
  231|   605k|      ref_counts[left_mbmi->ref_frame[1]]++;
  232|   605k|    }
  233|  3.91M|  }
  234|  4.55M|}
decodemv.c:av1_ref_frame_type:
  113|  10.8M|static inline int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
  114|  10.8M|  if (rf[1] > INTRA_FRAME) {
  ------------------
  |  Branch (114:7): [True: 1.38M, False: 9.48M]
  ------------------
  115|  1.38M|    const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf);
  116|  1.38M|    if (uni_comp_ref_idx >= 0) {
  ------------------
  |  Branch (116:9): [True: 292k, False: 1.09M]
  ------------------
  117|   292k|      assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) <
  118|   292k|             MODE_CTX_REF_FRAMES);
  119|   292k|      return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx;
  120|  1.09M|    } else {
  121|  1.09M|      return REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
  ------------------
  |  |  569|  1.09M|#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
  ------------------
  122|  1.09M|             BWD_RF_OFFSET(rf[1]) * FWD_REFS;
  ------------------
  |  |  570|  1.09M|#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
  ------------------
  123|  1.09M|    }
  124|  1.38M|  }
  125|       |
  126|  9.48M|  return rf[0];
  127|  10.8M|}
decodemv.c:get_uni_comp_ref_idx:
   99|  1.38M|static inline int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) {
  100|       |  // Single ref pred
  101|  1.38M|  if (rf[1] <= INTRA_FRAME) return -1;
  ------------------
  |  Branch (101:7): [True: 0, False: 1.38M]
  ------------------
  102|       |
  103|       |  // Bi-directional comp ref pred
  104|  1.38M|  if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1;
  ------------------
  |  Branch (104:7): [True: 1.33M, False: 55.2k]
  |  Branch (104:33): [True: 1.09M, False: 236k]
  ------------------
  105|       |
  106|   772k|  for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) {
  ------------------
  |  Branch (106:28): [True: 772k, False: 18.4E]
  ------------------
  107|   772k|    if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx))
  ------------------
  |  Branch (107:9): [True: 479k, False: 292k]
  |  Branch (107:40): [True: 292k, False: 187k]
  ------------------
  108|   292k|      return ref_idx;
  109|   772k|  }
  110|  18.4E|  return -1;
  111|   292k|}
decodemv.c:av1_mode_context_analyzer:
  171|  3.85M|    const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) {
  172|  3.85M|  const int8_t ref_frame = av1_ref_frame_type(rf);
  173|       |
  174|  3.85M|  if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame];
  ------------------
  |  Branch (174:7): [True: 3.33M, False: 523k]
  ------------------
  175|       |
  176|   523k|  const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK;
  ------------------
  |  |  490|   523k|#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1)
  |  |  ------------------
  |  |  |  |  487|   523k|#define GLOBALMV_OFFSET 3
  |  |  ------------------
  ------------------
  177|   523k|  const int16_t refmv_ctx =
  178|   523k|      (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK;
  ------------------
  |  |  488|   523k|#define REFMV_OFFSET 4
  ------------------
                    (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK;
  ------------------
  |  |  492|   523k|#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
  |  |  ------------------
  |  |  |  |  488|   523k|#define REFMV_OFFSET 4
  |  |  ------------------
  ------------------
  179|       |
  180|   523k|  const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
  ------------------
  |  |   34|   523k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 403k, False: 119k]
  |  |  ------------------
  ------------------
  181|   523k|      newmv_ctx, COMP_NEWMV_CTXS - 1)];
  182|   523k|  return comp_ctx;
  183|  3.85M|}
decodemv.c:av1_drl_ctx:
  185|  2.46M|static inline uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) {
  186|  2.46M|  if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
  ------------------
  |  |  512|  4.93M|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (186:7): [True: 2.24M, False: 220k]
  ------------------
  187|  2.46M|      ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL)
  ------------------
  |  |  512|  2.24M|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (187:7): [True: 1.39M, False: 854k]
  ------------------
  188|  1.39M|    return 0;
  189|       |
  190|  1.07M|  if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
  ------------------
  |  |  512|  2.14M|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (190:7): [True: 854k, False: 220k]
  ------------------
  191|  1.07M|      ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
  ------------------
  |  |  512|   854k|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (191:7): [True: 854k, False: 18.4E]
  ------------------
  192|   854k|    return 1;
  193|       |
  194|   220k|  if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL &&
  ------------------
  |  |  512|   440k|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (194:7): [True: 220k, False: 18.4E]
  ------------------
  195|   220k|      ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
  ------------------
  |  |  512|   220k|#define REF_CAT_LEVEL 640
  ------------------
  |  Branch (195:7): [True: 220k, False: 18.4E]
  ------------------
  196|   220k|    return 2;
  197|       |
  198|  18.4E|  return 0;
  199|   220k|}
decodemv.c:lower_mv_precision:
   88|  2.33M|static inline void lower_mv_precision(MV *mv, int allow_hp, int is_integer) {
   89|  2.33M|  if (is_integer) {
  ------------------
  |  Branch (89:7): [True: 226k, False: 2.10M]
  ------------------
   90|   226k|    integer_mv_precision(mv);
   91|  2.10M|  } else {
   92|  2.10M|    if (!allow_hp) {
  ------------------
  |  Branch (92:9): [True: 1.29M, False: 810k]
  ------------------
   93|  1.29M|      if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
  ------------------
  |  Branch (93:11): [True: 0, False: 1.29M]
  |  Branch (93:36): [True: 0, False: 0]
  ------------------
   94|  1.29M|      if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
  ------------------
  |  Branch (94:11): [True: 0, False: 1.29M]
  |  Branch (94:36): [True: 0, False: 0]
  ------------------
   95|  1.29M|    }
   96|  2.10M|  }
   97|  2.33M|}
decodemv.c:get_relative_dist:
   37|   674k|static inline int get_relative_dist(const OrderHintInfo *oh, int a, int b) {
   38|   674k|  if (!oh->enable_order_hint) return 0;
  ------------------
  |  Branch (38:7): [True: 0, False: 674k]
  ------------------
   39|       |
   40|   674k|  const int bits = oh->order_hint_bits_minus_1 + 1;
   41|       |
   42|   674k|  assert(bits >= 1);
   43|   674k|  assert(a >= 0 && a < (1 << bits));
   44|   674k|  assert(b >= 0 && b < (1 << bits));
   45|       |
   46|   674k|  int diff = a - b;
   47|   674k|  const int m = 1 << (bits - 1);
   48|   674k|  diff = (diff & (m - 1)) - (diff & m);
   49|   674k|  return diff;
   50|   674k|}
mvref_common.c:av1_set_ref_frame:
  153|  6.92M|                                     MV_REFERENCE_FRAME ref_frame_type) {
  154|  6.92M|  if (ref_frame_type >= REF_FRAMES) {
  ------------------
  |  Branch (154:7): [True: 2.03M, False: 4.88M]
  ------------------
  155|  2.03M|    rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0];
  156|  2.03M|    rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1];
  157|  4.88M|  } else {
  158|  4.88M|    assert(ref_frame_type > NONE_FRAME);
  159|  4.90M|    rf[0] = ref_frame_type;
  160|  4.90M|    rf[1] = NONE_FRAME;
  161|  4.90M|  }
  162|  6.92M|}
mvref_common.c:find_valid_row_offset:
   77|  4.39M|                                        int row_offset) {
   78|  4.39M|  return clamp(row_offset, tile->mi_row_start - mi_row,
   79|  4.39M|               tile->mi_row_end - mi_row - 1);
   80|  4.39M|}
mvref_common.c:find_valid_col_offset:
   83|  4.47M|                                        int col_offset) {
   84|  4.47M|  return clamp(col_offset, tile->mi_col_start - mi_col,
   85|  4.47M|               tile->mi_col_end - mi_col - 1);
   86|  4.47M|}
mvref_common.c:get_block_mv:
   62|  21.3M|static inline int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) {
   63|  21.3M|  return candidate->mv[which_mv];
   64|  21.3M|}
mvref_common.c:clamp_mv_ref:
   52|  13.3M|static inline void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
   53|  13.3M|  const SubpelMvLimits mv_limits = {
   54|  13.3M|    xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER,
  ------------------
  |  |   29|  13.3M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
                  xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER,
  ------------------
  |  |   35|  13.3M|#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
  ------------------
   55|  13.3M|    xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER,
  ------------------
  |  |   29|  13.3M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
                  xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER,
  ------------------
  |  |   35|  13.3M|#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
  ------------------
   56|  13.3M|    xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER,
  ------------------
  |  |   29|  13.3M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
                  xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER,
  ------------------
  |  |   35|  13.3M|#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
  ------------------
   57|  13.3M|    xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER
  ------------------
  |  |   29|  13.3M|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
                  xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER
  ------------------
  |  |   35|  13.3M|#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
  ------------------
   58|  13.3M|  };
   59|  13.3M|  clamp_mv(mv, &mv_limits);
   60|  13.3M|}
mvref_common.c:lower_mv_precision:
   88|  8.77M|static inline void lower_mv_precision(MV *mv, int allow_hp, int is_integer) {
   89|  8.77M|  if (is_integer) {
  ------------------
  |  Branch (89:7): [True: 245k, False: 8.52M]
  ------------------
   90|   245k|    integer_mv_precision(mv);
   91|  8.52M|  } else {
   92|  8.52M|    if (!allow_hp) {
  ------------------
  |  Branch (92:9): [True: 5.66M, False: 2.86M]
  ------------------
   93|  5.66M|      if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
  ------------------
  |  Branch (93:11): [True: 39.2k, False: 5.62M]
  |  Branch (93:36): [True: 14.3k, False: 24.9k]
  ------------------
   94|  5.66M|      if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
  ------------------
  |  Branch (94:11): [True: 38.8k, False: 5.62M]
  |  Branch (94:36): [True: 10.2k, False: 28.6k]
  ------------------
   95|  5.66M|    }
   96|  8.52M|  }
   97|  8.77M|}
mvref_common.c:get_relative_dist:
   37|  4.67M|static inline int get_relative_dist(const OrderHintInfo *oh, int a, int b) {
   38|  4.67M|  if (!oh->enable_order_hint) return 0;
  ------------------
  |  Branch (38:7): [True: 0, False: 4.67M]
  ------------------
   39|       |
   40|  4.67M|  const int bits = oh->order_hint_bits_minus_1 + 1;
   41|       |
   42|  4.67M|  assert(bits >= 1);
   43|  4.67M|  assert(a >= 0 && a < (1 << bits));
   44|  4.68M|  assert(b >= 0 && b < (1 << bits));
   45|       |
   46|  4.68M|  int diff = a - b;
   47|  4.68M|  const int m = 1 << (bits - 1);
   48|  4.68M|  diff = (diff & (m - 1)) - (diff & m);
   49|  4.68M|  return diff;
   50|  4.68M|}
mvref_common.c:is_inside:
   69|  25.9M|                            const POSITION *mi_pos) {
   70|  25.9M|  return !(mi_row + mi_pos->row < tile->mi_row_start ||
  ------------------
  |  Branch (70:12): [True: 422k, False: 25.5M]
  ------------------
   71|  25.9M|           mi_col + mi_pos->col < tile->mi_col_start ||
  ------------------
  |  Branch (71:12): [True: 90.1k, False: 25.4M]
  ------------------
   72|  25.9M|           mi_row + mi_pos->row >= tile->mi_row_end ||
  ------------------
  |  Branch (72:12): [True: 93.5k, False: 25.3M]
  ------------------
   73|  25.9M|           mi_col + mi_pos->col >= tile->mi_col_end);
  ------------------
  |  Branch (73:12): [True: 76.7k, False: 25.2M]
  ------------------
   74|  25.9M|}
reconinter.c:get_relative_dist:
   37|  1.25M|static inline int get_relative_dist(const OrderHintInfo *oh, int a, int b) {
   38|  1.25M|  if (!oh->enable_order_hint) return 0;
  ------------------
  |  Branch (38:7): [True: 0, False: 1.25M]
  ------------------
   39|       |
   40|  1.25M|  const int bits = oh->order_hint_bits_minus_1 + 1;
   41|       |
   42|  1.25M|  assert(bits >= 1);
   43|  1.25M|  assert(a >= 0 && a < (1 << bits));
   44|  1.25M|  assert(b >= 0 && b < (1 << bits));
   45|       |
   46|  1.25M|  int diff = a - b;
   47|  1.25M|  const int m = 1 << (bits - 1);
   48|  1.25M|  diff = (diff & (m - 1)) - (diff & m);
   49|  1.25M|  return diff;
   50|  1.25M|}

decodeframe.c:foreach_overlappable_nb_above:
   23|   624k|                                                 void *fun_ctxt) {
   24|   624k|  if (!xd->up_available) return;
  ------------------
  |  Branch (24:7): [True: 0, False: 624k]
  ------------------
   25|       |
   26|   624k|  const int num_planes = av1_num_planes(cm);
   27|   624k|  int nb_count = 0;
   28|   624k|  const int mi_col = xd->mi_col;
   29|       |  // prev_row_mi points into the mi array, starting at the beginning of the
   30|       |  // previous row.
   31|   624k|  MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
   32|   624k|  const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols);
  ------------------
  |  |   34|   624k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 615k, False: 8.31k]
  |  |  ------------------
  ------------------
   33|   624k|  uint8_t mi_step;
   34|  1.31M|  for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
  ------------------
  |  Branch (34:35): [True: 697k, False: 622k]
  |  Branch (34:61): [True: 695k, False: 1.63k]
  ------------------
   35|   695k|       above_mi_col += mi_step) {
   36|   695k|    MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
   37|   695k|    mi_step =
   38|   695k|        AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]);
  ------------------
  |  |   34|   695k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 672k, False: 22.9k]
  |  |  ------------------
  ------------------
   39|       |    // If we're considering a block with width 4, it should be treated as
   40|       |    // half of a pair of blocks with chroma information in the second. Move
   41|       |    // above_mi_col back to the start of the pair if needed, set above_mbmi
   42|       |    // to point at the block with chroma information, and set mi_step to 2 to
   43|       |    // step over the entire pair at the end of the iteration.
   44|   695k|    if (mi_step == 1) {
  ------------------
  |  Branch (44:9): [True: 46.1k, False: 649k]
  ------------------
   45|  46.1k|      above_mi_col &= ~1;
   46|  46.1k|      above_mi = prev_row_mi + above_mi_col + 1;
   47|  46.1k|      mi_step = 2;
   48|  46.1k|    }
   49|   695k|    if (is_neighbor_overlappable(*above_mi)) {
  ------------------
  |  Branch (49:9): [True: 627k, False: 68.1k]
  ------------------
   50|   627k|      ++nb_count;
   51|   627k|      fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0,
  ------------------
  |  |   34|   627k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 151k, False: 476k]
  |  |  ------------------
  ------------------
   52|   627k|          *above_mi, fun_ctxt, num_planes);
   53|   627k|    }
   54|   695k|  }
   55|   624k|}
decodeframe.c:foreach_overlappable_nb_left:
   60|   627k|                                                void *fun_ctxt) {
   61|   627k|  if (!xd->left_available) return;
  ------------------
  |  Branch (61:7): [True: 0, False: 627k]
  ------------------
   62|       |
   63|   627k|  const int num_planes = av1_num_planes(cm);
   64|   627k|  int nb_count = 0;
   65|       |  // prev_col_mi points into the mi array, starting at the top of the
   66|       |  // previous column
   67|   627k|  const int mi_row = xd->mi_row;
   68|   627k|  MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
   69|   627k|  const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows);
  ------------------
  |  |   34|   627k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 621k, False: 5.98k]
  |  |  ------------------
  ------------------
   70|   627k|  uint8_t mi_step;
   71|  1.31M|  for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
  ------------------
  |  Branch (71:34): [True: 691k, False: 625k]
  |  Branch (71:59): [True: 689k, False: 2.09k]
  ------------------
   72|   689k|       left_mi_row += mi_step) {
   73|   689k|    MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
   74|   689k|    mi_step =
   75|   689k|        AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]);
  ------------------
  |  |   34|   689k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 666k, False: 23.4k]
  |  |  ------------------
  ------------------
   76|   689k|    if (mi_step == 1) {
  ------------------
  |  Branch (76:9): [True: 59.4k, False: 630k]
  ------------------
   77|  59.4k|      left_mi_row &= ~1;
   78|  59.4k|      left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
   79|  59.4k|      mi_step = 2;
   80|  59.4k|    }
   81|   689k|    if (is_neighbor_overlappable(*left_mi)) {
  ------------------
  |  Branch (81:9): [True: 622k, False: 67.4k]
  ------------------
   82|   622k|      ++nb_count;
   83|   622k|      fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi,
  ------------------
  |  |   34|   622k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 184k, False: 437k]
  |  |  ------------------
  ------------------
   84|   622k|          fun_ctxt, num_planes);
   85|   622k|    }
   86|   689k|  }
   87|   627k|}
reconinter.c:foreach_overlappable_nb_above:
   23|  3.65M|                                                 void *fun_ctxt) {
   24|  3.65M|  if (!xd->up_available) return;
  ------------------
  |  Branch (24:7): [True: 211k, False: 3.44M]
  ------------------
   25|       |
   26|  3.44M|  const int num_planes = av1_num_planes(cm);
   27|  3.44M|  int nb_count = 0;
   28|  3.44M|  const int mi_col = xd->mi_col;
   29|       |  // prev_row_mi points into the mi array, starting at the beginning of the
   30|       |  // previous row.
   31|  3.44M|  MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
   32|  3.44M|  const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols);
  ------------------
  |  |   34|  3.44M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.35M, False: 94.9k]
  |  |  ------------------
  ------------------
   33|  3.44M|  uint8_t mi_step;
   34|  7.43M|  for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
  ------------------
  |  Branch (34:35): [True: 3.98M, False: 3.44M]
  |  Branch (34:61): [True: 3.99M, False: 18.4E]
  ------------------
   35|  3.99M|       above_mi_col += mi_step) {
   36|  3.99M|    MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
   37|  3.99M|    mi_step =
   38|  3.99M|        AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]);
  ------------------
  |  |   34|  3.99M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.64M, False: 342k]
  |  |  ------------------
  ------------------
   39|       |    // If we're considering a block with width 4, it should be treated as
   40|       |    // half of a pair of blocks with chroma information in the second. Move
   41|       |    // above_mi_col back to the start of the pair if needed, set above_mbmi
   42|       |    // to point at the block with chroma information, and set mi_step to 2 to
   43|       |    // step over the entire pair at the end of the iteration.
   44|  3.99M|    if (mi_step == 1) {
  ------------------
  |  Branch (44:9): [True: 251k, False: 3.73M]
  ------------------
   45|   251k|      above_mi_col &= ~1;
   46|   251k|      above_mi = prev_row_mi + above_mi_col + 1;
   47|   251k|      mi_step = 2;
   48|   251k|    }
   49|  3.99M|    if (is_neighbor_overlappable(*above_mi)) {
  ------------------
  |  Branch (49:9): [True: 3.53M, False: 452k]
  ------------------
   50|  3.53M|      ++nb_count;
   51|  3.53M|      fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0,
  ------------------
  |  |   34|  3.53M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 802k, False: 2.73M]
  |  |  ------------------
  ------------------
   52|  3.53M|          *above_mi, fun_ctxt, num_planes);
   53|  3.53M|    }
   54|  3.99M|  }
   55|  3.44M|}
reconinter.c:foreach_overlappable_nb_left:
   60|  1.13M|                                                void *fun_ctxt) {
   61|  1.13M|  if (!xd->left_available) return;
  ------------------
  |  Branch (61:7): [True: 56.9k, False: 1.07M]
  ------------------
   62|       |
   63|  1.07M|  const int num_planes = av1_num_planes(cm);
   64|  1.07M|  int nb_count = 0;
   65|       |  // prev_col_mi points into the mi array, starting at the top of the
   66|       |  // previous column
   67|  1.07M|  const int mi_row = xd->mi_row;
   68|  1.07M|  MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
   69|  1.07M|  const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows);
  ------------------
  |  |   34|  1.07M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.05M, False: 21.8k]
  |  |  ------------------
  ------------------
   70|  1.07M|  uint8_t mi_step;
   71|  2.33M|  for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
  ------------------
  |  Branch (71:34): [True: 1.26M, False: 1.07M]
  |  Branch (71:59): [True: 1.25M, False: 1.70k]
  ------------------
   72|  1.25M|       left_mi_row += mi_step) {
   73|  1.25M|    MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
   74|  1.25M|    mi_step =
   75|  1.25M|        AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]);
  ------------------
  |  |   34|  1.25M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.07M, False: 179k]
  |  |  ------------------
  ------------------
   76|  1.25M|    if (mi_step == 1) {
  ------------------
  |  Branch (76:9): [True: 89.0k, False: 1.17M]
  ------------------
   77|  89.0k|      left_mi_row &= ~1;
   78|  89.0k|      left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
   79|  89.0k|      mi_step = 2;
   80|  89.0k|    }
   81|  1.25M|    if (is_neighbor_overlappable(*left_mi)) {
  ------------------
  |  Branch (81:9): [True: 1.09M, False: 167k]
  ------------------
   82|  1.09M|      ++nb_count;
   83|  1.09M|      fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi,
  ------------------
  |  |   34|  1.09M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 266k, False: 825k]
  |  |  ------------------
  ------------------
   84|  1.09M|          fun_ctxt, num_planes);
   85|  1.09M|    }
   86|  1.25M|  }
   87|  1.07M|}

aom_read_obu_header_and_size:
   95|  1.49M|                                             size_t *const bytes_read) {
   96|  1.49M|  size_t length_field_size_obu = 0;
   97|  1.49M|  size_t length_field_size_payload = 0;
   98|  1.49M|  size_t obu_size = 0;
   99|  1.49M|  aom_codec_err_t status;
  100|       |
  101|  1.49M|  if (is_annexb) {
  ------------------
  |  Branch (101:7): [True: 98.0k, False: 1.39M]
  ------------------
  102|       |    // Size field comes before the OBU header, and includes the OBU header
  103|  98.0k|    status =
  104|  98.0k|        read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu);
  105|       |
  106|  98.0k|    if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (106:9): [True: 570, False: 97.5k]
  ------------------
  107|  98.0k|  }
  108|       |
  109|  1.49M|  struct aom_read_bit_buffer rb = { data + length_field_size_obu,
  110|  1.49M|                                    data + bytes_available, 0, NULL, NULL };
  111|       |
  112|  1.49M|  status = read_obu_header(&rb, is_annexb, obu_header);
  113|  1.49M|  if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (113:7): [True: 226k, False: 1.26M]
  ------------------
  114|       |
  115|  1.26M|  if (!obu_header->has_size_field) {
  ------------------
  |  Branch (115:7): [True: 65.9k, False: 1.20M]
  ------------------
  116|  65.9k|    assert(is_annexb);
  117|       |    // Derive the payload size from the data we've already read
  118|  65.9k|    if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (118:9): [True: 1.58k, False: 64.3k]
  ------------------
  119|       |
  120|  64.3k|    *payload_size = obu_size - obu_header->size;
  121|  1.20M|  } else {
  122|       |    // Size field comes after the OBU header, and is just the payload size
  123|  1.20M|    status = read_obu_size(
  124|  1.20M|        data + length_field_size_obu + obu_header->size,
  125|  1.20M|        bytes_available - length_field_size_obu - obu_header->size,
  126|  1.20M|        payload_size, &length_field_size_payload);
  127|  1.20M|    if (status != AOM_CODEC_OK) return status;
  ------------------
  |  Branch (127:9): [True: 3.08k, False: 1.19M]
  ------------------
  128|  1.20M|  }
  129|       |
  130|  1.26M|  *bytes_read =
  131|  1.26M|      length_field_size_obu + obu_header->size + length_field_size_payload;
  132|  1.26M|  return AOM_CODEC_OK;
  133|  1.26M|}
obu_util.c:read_obu_header:
   34|  1.49M|                                       int is_annexb, ObuHeader *header) {
   35|  1.49M|  if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
  ------------------
  |  Branch (35:7): [True: 0, False: 1.49M]
  |  Branch (35:14): [True: 0, False: 1.49M]
  ------------------
   36|       |
   37|  1.49M|  const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
   38|  1.49M|  if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (38:7): [True: 389, False: 1.49M]
  ------------------
   39|       |
   40|  1.49M|  header->size = 1;
   41|       |
   42|  1.49M|  if (aom_rb_read_bit(rb) != 0) {
  ------------------
  |  Branch (42:7): [True: 16.2k, False: 1.47M]
  ------------------
   43|       |    // Forbidden bit. Must not be set.
   44|  16.2k|    return AOM_CODEC_CORRUPT_FRAME;
   45|  16.2k|  }
   46|       |
   47|  1.47M|  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
   48|  1.47M|  header->has_extension = aom_rb_read_bit(rb);
   49|  1.47M|  header->has_size_field = aom_rb_read_bit(rb);
   50|       |
   51|  1.47M|  if (!header->has_size_field && !is_annexb) {
  ------------------
  |  Branch (51:7): [True: 275k, False: 1.20M]
  |  Branch (51:34): [True: 209k, False: 66.0k]
  ------------------
   52|       |    // section 5 obu streams must have obu_size field set.
   53|   209k|    return AOM_CODEC_UNSUP_BITSTREAM;
   54|   209k|  }
   55|       |
   56|       |  // obu_reserved_1bit must be set to 0. The value is ignored by a decoder.
   57|  1.26M|  aom_rb_read_bit(rb);
   58|       |
   59|  1.26M|  if (header->has_extension) {
  ------------------
  |  Branch (59:7): [True: 185k, False: 1.08M]
  ------------------
   60|   185k|    if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (60:9): [True: 335, False: 185k]
  ------------------
   61|       |
   62|   185k|    header->size += 1;
   63|   185k|    header->temporal_layer_id = aom_rb_read_literal(rb, 3);
   64|   185k|    header->spatial_layer_id = aom_rb_read_literal(rb, 2);
   65|       |    // extension_header_reserved_3bits must be set to 0. The value is ignored by
   66|       |    // a decoder.
   67|   185k|    aom_rb_read_literal(rb, 3);
   68|  1.08M|  } else {
   69|  1.08M|    header->temporal_layer_id = 0;
   70|  1.08M|    header->spatial_layer_id = 0;
   71|  1.08M|  }
   72|       |
   73|  1.26M|  return AOM_CODEC_OK;
   74|  1.26M|}
obu_util.c:read_obu_size:
   20|  1.29M|                                     size_t *const length_field_size) {
   21|  1.29M|  uint64_t u_obu_size = 0;
   22|  1.29M|  if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
  ------------------
  |  Branch (22:7): [True: 3.65k, False: 1.29M]
  ------------------
   23|  1.29M|      0) {
   24|  3.65k|    return AOM_CODEC_CORRUPT_FRAME;
   25|  3.65k|  }
   26|       |
   27|  1.29M|  if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
  ------------------
  |  Branch (27:7): [True: 0, False: 1.29M]
  ------------------
   28|  1.29M|  *obu_size = (size_t)u_obu_size;
   29|  1.29M|  return AOM_CODEC_OK;
   30|  1.29M|}

av1_get_pred_context_switchable_interp:
   30|  3.10M|int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
   31|  3.10M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
   32|  3.10M|  const int ctx_offset =
   33|  3.10M|      (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
  ------------------
  |  |  101|  3.10M|#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
  ------------------
   34|  3.10M|  assert(dir == 0 || dir == 1);
   35|  3.10M|  const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
   36|       |  // Note:
   37|       |  // The mode info data structure has a one element border above and to the
   38|       |  // left of the entries corresponding to real macroblocks.
   39|       |  // The prediction flags in these dummy entries are initialized to 0.
   40|  3.10M|  int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET;
  ------------------
  |  |  102|  3.10M|#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
  ------------------
   41|  3.10M|  int left_type = SWITCHABLE_FILTERS;
   42|  3.10M|  int above_type = SWITCHABLE_FILTERS;
   43|       |
   44|  3.10M|  if (xd->left_available)
  ------------------
  |  Branch (44:7): [True: 3.03M, False: 65.7k]
  ------------------
   45|  3.03M|    left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame);
   46|       |
   47|  3.10M|  if (xd->up_available)
  ------------------
  |  Branch (47:7): [True: 3.01M, False: 85.8k]
  ------------------
   48|  3.01M|    above_type =
   49|  3.01M|        get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame);
   50|       |
   51|  3.10M|  if (left_type == above_type) {
  ------------------
  |  Branch (51:7): [True: 2.03M, False: 1.06M]
  ------------------
   52|  2.03M|    filter_type_ctx += left_type;
   53|  2.03M|  } else if (left_type == SWITCHABLE_FILTERS) {
  ------------------
  |  Branch (53:14): [True: 483k, False: 583k]
  ------------------
   54|   483k|    assert(above_type != SWITCHABLE_FILTERS);
   55|   483k|    filter_type_ctx += above_type;
   56|   583k|  } else if (above_type == SWITCHABLE_FILTERS) {
  ------------------
  |  Branch (56:14): [True: 498k, False: 85.3k]
  ------------------
   57|   498k|    assert(left_type != SWITCHABLE_FILTERS);
   58|   498k|    filter_type_ctx += left_type;
   59|   498k|  } else {
   60|  85.3k|    filter_type_ctx += SWITCHABLE_FILTERS;
   61|  85.3k|  }
   62|       |
   63|  3.10M|  return filter_type_ctx;
   64|  3.10M|}
av1_get_palette_cache:
   74|   161k|                          uint16_t *cache) {
   75|   161k|  const int row = -xd->mb_to_top_edge >> 3;
   76|       |  // Do not refer to above SB row when on SB boundary.
   77|   161k|  const MB_MODE_INFO *const above_mi =
   78|   161k|      (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL;
  ------------------
  |  |   36|   161k|#define MIN_SB_SIZE_LOG2 6
  ------------------
  |  Branch (78:7): [True: 135k, False: 26.2k]
  ------------------
   79|   161k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
   80|   161k|  int above_n = 0, left_n = 0;
   81|   161k|  if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0];
  ------------------
  |  Branch (81:7): [True: 135k, False: 26.2k]
  ------------------
   82|   161k|  if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0];
  ------------------
  |  Branch (82:7): [True: 140k, False: 20.9k]
  ------------------
   83|   161k|  if (above_n == 0 && left_n == 0) return 0;
  ------------------
  |  Branch (83:7): [True: 114k, False: 46.8k]
  |  Branch (83:23): [True: 83.4k, False: 31.4k]
  ------------------
   84|  78.2k|  int above_idx = plane * PALETTE_MAX_SIZE;
  ------------------
  |  |   63|  78.2k|#define PALETTE_MAX_SIZE 8
  ------------------
   85|  78.2k|  int left_idx = plane * PALETTE_MAX_SIZE;
  ------------------
  |  |   63|  78.2k|#define PALETTE_MAX_SIZE 8
  ------------------
   86|  78.2k|  int n = 0;
   87|  78.2k|  const uint16_t *above_colors =
   88|  78.2k|      above_mi ? above_mi->palette_mode_info.palette_colors : NULL;
  ------------------
  |  Branch (88:7): [True: 69.6k, False: 8.66k]
  ------------------
   89|  78.2k|  const uint16_t *left_colors =
   90|  78.2k|      left_mi ? left_mi->palette_mode_info.palette_colors : NULL;
  ------------------
  |  Branch (90:7): [True: 69.6k, False: 8.60k]
  ------------------
   91|       |  // Merge the sorted lists of base colors from above and left to get
   92|       |  // combined sorted color cache.
   93|   227k|  while (above_n > 0 && left_n > 0) {
  ------------------
  |  Branch (93:10): [True: 183k, False: 43.8k]
  |  Branch (93:25): [True: 149k, False: 34.4k]
  ------------------
   94|   149k|    uint16_t v_above = above_colors[above_idx];
   95|   149k|    uint16_t v_left = left_colors[left_idx];
   96|   149k|    if (v_left < v_above) {
  ------------------
  |  Branch (96:9): [True: 60.3k, False: 88.7k]
  ------------------
   97|  60.3k|      palette_add_to_cache(cache, &n, v_left);
   98|  60.3k|      ++left_idx, --left_n;
   99|  88.7k|    } else {
  100|  88.7k|      palette_add_to_cache(cache, &n, v_above);
  101|  88.7k|      ++above_idx, --above_n;
  102|  88.7k|      if (v_left == v_above) ++left_idx, --left_n;
  ------------------
  |  Branch (102:11): [True: 37.3k, False: 51.3k]
  ------------------
  103|  88.7k|    }
  104|   149k|  }
  105|   223k|  while (above_n-- > 0) {
  ------------------
  |  Branch (105:10): [True: 145k, False: 78.2k]
  ------------------
  106|   145k|    uint16_t val = above_colors[above_idx++];
  107|   145k|    palette_add_to_cache(cache, &n, val);
  108|   145k|  }
  109|   225k|  while (left_n-- > 0) {
  ------------------
  |  Branch (109:10): [True: 147k, False: 78.2k]
  ------------------
  110|   147k|    uint16_t val = left_colors[left_idx++];
  111|   147k|    palette_add_to_cache(cache, &n, val);
  112|   147k|  }
  113|  78.2k|  assert(n <= 2 * PALETTE_MAX_SIZE);
  114|  78.3k|  return n;
  115|  78.2k|}
av1_get_intra_inter_context:
  124|  5.70M|int av1_get_intra_inter_context(const MACROBLOCKD *xd) {
  125|  5.70M|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  126|  5.70M|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  127|  5.70M|  const int has_above = xd->up_available;
  128|  5.70M|  const int has_left = xd->left_available;
  129|       |
  130|  5.70M|  if (has_above && has_left) {  // both edges available
  ------------------
  |  Branch (130:7): [True: 5.54M, False: 163k]
  |  Branch (130:20): [True: 5.47M, False: 69.8k]
  ------------------
  131|  5.47M|    const int above_intra = !is_inter_block(above_mbmi);
  132|  5.47M|    const int left_intra = !is_inter_block(left_mbmi);
  133|  5.47M|    return left_intra && above_intra ? 3 : left_intra || above_intra;
  ------------------
  |  Branch (133:12): [True: 1.74M, False: 3.72M]
  |  Branch (133:26): [True: 1.20M, False: 535k]
  |  Branch (133:44): [True: 535k, False: 3.72M]
  |  Branch (133:58): [True: 512k, False: 3.21M]
  ------------------
  134|  5.47M|  } else if (has_above || has_left) {  // one edge available
  ------------------
  |  Branch (134:14): [True: 68.8k, False: 164k]
  |  Branch (134:27): [True: 131k, False: 33.3k]
  ------------------
  135|   201k|    return 2 * !is_inter_block(has_above ? above_mbmi : left_mbmi);
  ------------------
  |  Branch (135:32): [True: 70.7k, False: 131k]
  ------------------
  136|   201k|  } else {
  137|  31.3k|    return 0;
  138|  31.3k|  }
  139|  5.70M|}
av1_get_reference_mode_context:
  145|   989k|int av1_get_reference_mode_context(const MACROBLOCKD *xd) {
  146|   989k|  int ctx;
  147|   989k|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  148|   989k|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  149|   989k|  const int has_above = xd->up_available;
  150|   989k|  const int has_left = xd->left_available;
  151|       |
  152|       |  // Note:
  153|       |  // The mode info data structure has a one element border above and to the
  154|       |  // left of the entries corresponding to real macroblocks.
  155|       |  // The prediction flags in these dummy entries are initialized to 0.
  156|   989k|  if (has_above && has_left) {  // both edges available
  ------------------
  |  Branch (156:7): [True: 907k, False: 82.0k]
  |  Branch (156:20): [True: 879k, False: 28.2k]
  ------------------
  157|   879k|    if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
  ------------------
  |  Branch (157:9): [True: 494k, False: 385k]
  |  Branch (157:40): [True: 349k, False: 144k]
  ------------------
  158|       |      // neither edge uses comp pred (0/1)
  159|   350k|      ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^
  ------------------
  |  |  143|   350k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|   350k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 71.1k, False: 278k]
  |  |  |  |  |  Branch (142:37): [True: 71.1k, False: 18.4E]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  160|   350k|            IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]);
  ------------------
  |  |  143|   350k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|   350k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 71.3k, False: 278k]
  |  |  |  |  |  Branch (142:37): [True: 71.3k, False: 18.4E]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  161|   529k|    else if (!has_second_ref(above_mbmi))
  ------------------
  |  Branch (161:14): [True: 144k, False: 385k]
  ------------------
  162|       |      // one of two edges uses comp pred (2/3)
  163|   144k|      ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ||
  ------------------
  |  |  143|   144k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|   288k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 37.3k, False: 106k]
  |  |  |  |  |  Branch (142:37): [True: 37.3k, False: 18.4E]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  164|   144k|                 !is_inter_block(above_mbmi));
  ------------------
  |  Branch (164:18): [True: 26.2k, False: 80.4k]
  ------------------
  165|   385k|    else if (!has_second_ref(left_mbmi))
  ------------------
  |  Branch (165:14): [True: 137k, False: 248k]
  ------------------
  166|       |      // one of two edges uses comp pred (2/3)
  167|   137k|      ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) ||
  ------------------
  |  |  143|   137k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|   274k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 34.5k, False: 102k]
  |  |  |  |  |  Branch (142:37): [True: 34.5k, False: 18.4E]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  168|   137k|                 !is_inter_block(left_mbmi));
  ------------------
  |  Branch (168:18): [True: 26.0k, False: 76.5k]
  ------------------
  169|   248k|    else  // both edges use comp pred (4)
  170|   248k|      ctx = 4;
  171|   879k|  } else if (has_above || has_left) {  // one edge available
  ------------------
  |  Branch (171:14): [True: 27.4k, False: 82.8k]
  |  Branch (171:27): [True: 67.7k, False: 15.1k]
  ------------------
  172|  96.2k|    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
  ------------------
  |  Branch (172:37): [True: 28.5k, False: 67.7k]
  ------------------
  173|       |
  174|  96.2k|    if (!has_second_ref(edge_mbmi))
  ------------------
  |  Branch (174:9): [True: 44.7k, False: 51.5k]
  ------------------
  175|       |      // edge does not use comp pred (0/1)
  176|  44.7k|      ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]);
  ------------------
  |  |  143|  44.7k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  44.7k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 16.2k, False: 28.4k]
  |  |  |  |  |  Branch (142:37): [True: 16.2k, False: 1]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  177|  51.5k|    else
  178|       |      // edge uses comp pred (3)
  179|  51.5k|      ctx = 3;
  180|  96.2k|  } else {  // no edges available (1)
  181|  13.9k|    ctx = 1;
  182|  13.9k|  }
  183|   989k|  assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
  184|   989k|  return ctx;
  185|   989k|}
av1_get_comp_reference_type_context:
  187|   524k|int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) {
  188|   524k|  int pred_context;
  189|   524k|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  190|   524k|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  191|   524k|  const int above_in_image = xd->up_available;
  192|   524k|  const int left_in_image = xd->left_available;
  193|       |
  194|   524k|  if (above_in_image && left_in_image) {  // both edges available
  ------------------
  |  Branch (194:7): [True: 476k, False: 48.3k]
  |  Branch (194:25): [True: 463k, False: 12.9k]
  ------------------
  195|   463k|    const int above_intra = !is_inter_block(above_mbmi);
  196|   463k|    const int left_intra = !is_inter_block(left_mbmi);
  197|       |
  198|   463k|    if (above_intra && left_intra) {  // intra/intra
  ------------------
  |  Branch (198:9): [True: 35.2k, False: 428k]
  |  Branch (198:24): [True: 6.59k, False: 28.6k]
  ------------------
  199|  6.59k|      pred_context = 2;
  200|   456k|    } else if (above_intra || left_intra) {  // intra/inter
  ------------------
  |  Branch (200:16): [True: 28.7k, False: 428k]
  |  Branch (200:31): [True: 28.4k, False: 399k]
  ------------------
  201|  57.0k|      const MB_MODE_INFO *inter_mbmi = above_intra ? left_mbmi : above_mbmi;
  ------------------
  |  Branch (201:40): [True: 28.6k, False: 28.4k]
  ------------------
  202|       |
  203|  57.0k|      if (!has_second_ref(inter_mbmi))  // single pred
  ------------------
  |  Branch (203:11): [True: 23.1k, False: 33.9k]
  ------------------
  204|  23.1k|        pred_context = 2;
  205|  33.9k|      else  // comp pred
  206|  33.9k|        pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi);
  207|   399k|    } else {  // inter/inter
  208|   399k|      const int a_sg = !has_second_ref(above_mbmi);
  209|   399k|      const int l_sg = !has_second_ref(left_mbmi);
  210|   399k|      const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0];
  211|   399k|      const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0];
  212|       |
  213|   399k|      if (a_sg && l_sg) {  // single/single
  ------------------
  |  Branch (213:11): [True: 117k, False: 281k]
  |  Branch (213:19): [True: 47.9k, False: 69.9k]
  ------------------
  214|  47.9k|        pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^
  ------------------
  |  |  143|  47.9k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  47.9k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 10.1k, False: 37.7k]
  |  |  |  |  |  Branch (142:37): [True: 10.1k, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  215|  47.9k|                                  IS_BACKWARD_REF_FRAME(frfl)));
  ------------------
  |  |  143|  47.9k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  47.9k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 12.0k, False: 35.9k]
  |  |  |  |  |  Branch (142:37): [True: 12.0k, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  216|   351k|      } else if (l_sg || a_sg) {  // single/comp
  ------------------
  |  Branch (216:18): [True: 62.3k, False: 289k]
  |  Branch (216:26): [True: 69.8k, False: 219k]
  ------------------
  217|   132k|        const int uni_rfc =
  218|   132k|            a_sg ? has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi);
  ------------------
  |  Branch (218:13): [True: 69.9k, False: 62.2k]
  ------------------
  219|       |
  220|   132k|        if (!uni_rfc)  // comp bidir
  ------------------
  |  Branch (220:13): [True: 108k, False: 23.7k]
  ------------------
  221|   108k|          pred_context = 1;
  222|  23.7k|        else  // comp unidir
  223|  23.7k|          pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^
  ------------------
  |  |  143|  23.7k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  23.7k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 7.63k, False: 16.1k]
  |  |  |  |  |  Branch (142:37): [True: 7.63k, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  224|  23.7k|                                IS_BACKWARD_REF_FRAME(frfl)));
  ------------------
  |  |  143|  23.7k|#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
  |  |  ------------------
  |  |  |  |  142|  23.7k|  (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (142:4): [True: 7.27k, False: 16.4k]
  |  |  |  |  |  Branch (142:37): [True: 7.27k, False: 0]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  225|   219k|      } else {  // comp/comp
  226|   219k|        const int a_uni_rfc = has_uni_comp_refs(above_mbmi);
  227|   219k|        const int l_uni_rfc = has_uni_comp_refs(left_mbmi);
  228|       |
  229|   219k|        if (!a_uni_rfc && !l_uni_rfc)  // bidir/bidir
  ------------------
  |  Branch (229:13): [True: 185k, False: 34.4k]
  |  Branch (229:27): [True: 173k, False: 11.5k]
  ------------------
  230|   173k|          pred_context = 0;
  231|  45.9k|        else if (!a_uni_rfc || !l_uni_rfc)  // unidir/bidir
  ------------------
  |  Branch (231:18): [True: 11.5k, False: 34.4k]
  |  Branch (231:32): [True: 20.1k, False: 14.2k]
  ------------------
  232|  31.7k|          pred_context = 2;
  233|  14.2k|        else  // unidir/unidir
  234|  14.2k|          pred_context =
  235|  14.2k|              3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME)));
  236|   219k|      }
  237|   399k|    }
  238|   463k|  } else if (above_in_image || left_in_image) {  // one edge available
  ------------------
  |  Branch (238:14): [True: 12.6k, False: 48.5k]
  |  Branch (238:32): [True: 37.3k, False: 11.2k]
  ------------------
  239|  50.3k|    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
  ------------------
  |  Branch (239:37): [True: 13.0k, False: 37.3k]
  ------------------
  240|       |
  241|  50.3k|    if (!is_inter_block(edge_mbmi)) {  // intra
  ------------------
  |  Branch (241:9): [True: 863, False: 49.5k]
  ------------------
  242|    863|      pred_context = 2;
  243|  49.5k|    } else {                           // inter
  244|  49.5k|      if (!has_second_ref(edge_mbmi))  // single pred
  ------------------
  |  Branch (244:11): [True: 8.57k, False: 40.9k]
  ------------------
  245|  8.57k|        pred_context = 2;
  246|  40.9k|      else  // comp pred
  247|  40.9k|        pred_context = 4 * has_uni_comp_refs(edge_mbmi);
  248|  49.5k|    }
  249|  50.3k|  } else {  // no edges available
  250|  10.8k|    pred_context = 2;
  251|  10.8k|  }
  252|       |
  253|   524k|  assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS);
  254|   524k|  return pred_context;
  255|   524k|}
av1_get_pred_context_uni_comp_ref_p:
  265|  87.2k|int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) {
  266|  87.2k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  267|       |
  268|       |  // Count of forward references (L, L2, L3, or G)
  269|  87.2k|  const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] +
  270|  87.2k|                        ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
  271|       |  // Count of backward references (B or A)
  272|  87.2k|  const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] +
  273|  87.2k|                        ref_counts[ALTREF_FRAME];
  274|       |
  275|  87.2k|  const int pred_context =
  276|  87.2k|      (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2);
  ------------------
  |  Branch (276:7): [True: 11.0k, False: 76.2k]
  |  Branch (276:39): [True: 20.0k, False: 56.1k]
  ------------------
  277|       |
  278|  87.2k|  assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
  279|  87.2k|  return pred_context;
  280|  87.2k|}
av1_get_pred_context_uni_comp_ref_p1:
  290|  65.9k|int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) {
  291|  65.9k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  292|       |
  293|       |  // Count of LAST2
  294|  65.9k|  const int last2_count = ref_counts[LAST2_FRAME];
  295|       |  // Count of LAST3 or GOLDEN
  296|  65.9k|  const int last3_or_gld_count =
  297|  65.9k|      ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
  298|       |
  299|  65.9k|  const int pred_context = (last2_count == last3_or_gld_count)
  ------------------
  |  Branch (299:28): [True: 29.4k, False: 36.5k]
  ------------------
  300|  65.9k|                               ? 1
  301|  65.9k|                               : ((last2_count < last3_or_gld_count) ? 0 : 2);
  ------------------
  |  Branch (301:35): [True: 24.4k, False: 12.0k]
  ------------------
  302|       |
  303|  65.9k|  assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
  304|  65.9k|  return pred_context;
  305|  65.9k|}
av1_get_pred_context_uni_comp_ref_p2:
  315|  39.6k|int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) {
  316|  39.6k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  317|       |
  318|       |  // Count of LAST3
  319|  39.6k|  const int last3_count = ref_counts[LAST3_FRAME];
  320|       |  // Count of GOLDEN
  321|  39.6k|  const int gld_count = ref_counts[GOLDEN_FRAME];
  322|       |
  323|  39.6k|  const int pred_context =
  324|  39.6k|      (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2);
  ------------------
  |  Branch (324:7): [True: 19.5k, False: 20.0k]
  |  Branch (324:41): [True: 11.8k, False: 8.26k]
  ------------------
  325|       |
  326|  39.6k|  assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
  327|  39.6k|  return pred_context;
  328|  39.6k|}
av1_get_pred_context_comp_ref_p:
  421|   437k|int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) {
  422|   437k|  return get_pred_context_ll2_or_l3gld(xd);
  423|   437k|}
av1_get_pred_context_comp_ref_p1:
  428|   335k|int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) {
  429|   335k|  return get_pred_context_last_or_last2(xd);
  430|   335k|}
av1_get_pred_context_comp_ref_p2:
  435|   101k|int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) {
  436|   101k|  return get_pred_context_last3_or_gld(xd);
  437|   101k|}
av1_get_pred_context_comp_bwdref_p:
  441|   437k|int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) {
  442|   437k|  return get_pred_context_brfarf2_or_arf(xd);
  443|   437k|}
av1_get_pred_context_comp_bwdref_p1:
  447|   183k|int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) {
  448|   183k|  return get_pred_context_brf_or_arf2(xd);
  449|   183k|}
av1_get_pred_context_single_ref_p1:
  455|  3.32M|int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
  456|  3.32M|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  457|       |
  458|       |  // Count of forward reference frames
  459|  3.32M|  const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] +
  460|  3.32M|                        ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
  461|       |  // Count of backward reference frames
  462|  3.32M|  const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] +
  463|  3.32M|                        ref_counts[ALTREF_FRAME];
  464|       |
  465|  3.32M|  const int pred_context =
  466|  3.32M|      (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 0 : 2);
  ------------------
  |  Branch (466:7): [True: 369k, False: 2.95M]
  |  Branch (466:39): [True: 341k, False: 2.61M]
  ------------------
  467|       |
  468|  3.32M|  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  469|  3.32M|  return pred_context;
  470|  3.32M|}
av1_get_pred_context_single_ref_p2:
  475|   472k|int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
  476|   472k|  return get_pred_context_brfarf2_or_arf(xd);
  477|   472k|}
av1_get_pred_context_single_ref_p3:
  481|  2.85M|int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
  482|  2.85M|  return get_pred_context_ll2_or_l3gld(xd);
  483|  2.85M|}
av1_get_pred_context_single_ref_p4:
  487|  2.60M|int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
  488|  2.60M|  return get_pred_context_last_or_last2(xd);
  489|  2.60M|}
av1_get_pred_context_single_ref_p5:
  493|   245k|int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
  494|   245k|  return get_pred_context_last3_or_gld(xd);
  495|   245k|}
av1_get_pred_context_single_ref_p6:
  499|   215k|int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) {
  500|   215k|  return get_pred_context_brf_or_arf2(xd);
  501|   215k|}
pred_common.c:get_ref_filter_type:
   21|  6.04M|                                        MV_REFERENCE_FRAME ref_frame) {
   22|  6.04M|  (void)xd;
   23|       |
   24|  6.04M|  return ((ref_mbmi->ref_frame[0] == ref_frame ||
  ------------------
  |  Branch (24:12): [True: 4.46M, False: 1.58M]
  ------------------
   25|  6.04M|           ref_mbmi->ref_frame[1] == ref_frame)
  ------------------
  |  Branch (25:12): [True: 69.5k, False: 1.51M]
  ------------------
   26|  6.04M|              ? av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01)
   27|  6.04M|              : SWITCHABLE_FILTERS);
   28|  6.04M|}
pred_common.c:palette_add_to_cache:
   66|   441k|static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) {
   67|       |  // Do not add an already existing value
   68|   441k|  if (*n > 0 && val == cache[*n - 1]) return;
  ------------------
  |  Branch (68:7): [True: 363k, False: 78.2k]
  |  Branch (68:17): [True: 43.9k, False: 319k]
  ------------------
   69|       |
   70|   397k|  cache[(*n)++] = val;
   71|   397k|}
pred_common.c:get_pred_context_ll2_or_l3gld:
  334|  3.29M|static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) {
  335|  3.29M|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  336|       |
  337|       |  // Count of LAST + LAST2
  338|  3.29M|  const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME];
  339|       |  // Count of LAST3 + GOLDEN
  340|  3.29M|  const int last3_gld_count =
  341|  3.29M|      ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
  342|       |
  343|  3.29M|  const int pred_context = (last_last2_count == last3_gld_count)
  ------------------
  |  Branch (343:28): [True: 320k, False: 2.96M]
  ------------------
  344|  3.29M|                               ? 1
  345|  3.29M|                               : ((last_last2_count < last3_gld_count) ? 0 : 2);
  ------------------
  |  Branch (345:35): [True: 244k, False: 2.72M]
  ------------------
  346|       |
  347|  3.29M|  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  348|  3.29M|  return pred_context;
  349|  3.29M|}
pred_common.c:get_pred_context_last_or_last2:
  352|  2.94M|static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) {
  353|  2.94M|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  354|       |
  355|       |  // Count of LAST
  356|  2.94M|  const int last_count = ref_counts[LAST_FRAME];
  357|       |  // Count of LAST2
  358|  2.94M|  const int last2_count = ref_counts[LAST2_FRAME];
  359|       |
  360|  2.94M|  const int pred_context =
  361|  2.94M|      (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2);
  ------------------
  |  Branch (361:7): [True: 241k, False: 2.70M]
  |  Branch (361:42): [True: 87.9k, False: 2.61M]
  ------------------
  362|       |
  363|  2.94M|  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  364|  2.94M|  return pred_context;
  365|  2.94M|}
pred_common.c:get_pred_context_last3_or_gld:
  368|   347k|static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) {
  369|   347k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  370|       |
  371|       |  // Count of LAST3
  372|   347k|  const int last3_count = ref_counts[LAST3_FRAME];
  373|       |  // Count of GOLDEN
  374|   347k|  const int gld_count = ref_counts[GOLDEN_FRAME];
  375|       |
  376|   347k|  const int pred_context =
  377|   347k|      (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2);
  ------------------
  |  Branch (377:7): [True: 90.0k, False: 256k]
  |  Branch (377:41): [True: 178k, False: 78.9k]
  ------------------
  378|       |
  379|   347k|  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  380|   347k|  return pred_context;
  381|   347k|}
pred_common.c:get_pred_context_brfarf2_or_arf:
  385|   909k|static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) {
  386|   909k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  387|       |
  388|       |  // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A)
  389|   909k|  const int brfarf2_count =
  390|   909k|      ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME];
  391|   909k|  const int arf_count = ref_counts[ALTREF_FRAME];
  392|       |
  393|   909k|  const int pred_context =
  394|   909k|      (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2);
  ------------------
  |  Branch (394:7): [True: 205k, False: 704k]
  |  Branch (394:43): [True: 416k, False: 287k]
  ------------------
  395|       |
  396|   909k|  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  397|   909k|  return pred_context;
  398|   909k|}
pred_common.c:get_pred_context_brf_or_arf2:
  401|   398k|static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) {
  402|   398k|  const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
  403|       |
  404|       |  // Count of BWDREF frames (B)
  405|   398k|  const int brf_count = ref_counts[BWDREF_FRAME];
  406|       |  // Count of ALTREF2 frames (A2)
  407|   398k|  const int arf2_count = ref_counts[ALTREF2_FRAME];
  408|       |
  409|   398k|  const int pred_context =
  410|   398k|      (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 0 : 2);
  ------------------
  |  Branch (410:7): [True: 113k, False: 284k]
  |  Branch (410:40): [True: 155k, False: 129k]
  ------------------
  411|       |
  412|   398k|  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
  413|   398k|  return pred_context;
  414|   398k|}

decodeframe.c:get_tx_size_context:
  342|  1.63M|static inline int get_tx_size_context(const MACROBLOCKD *xd) {
  343|  1.63M|  const MB_MODE_INFO *mbmi = xd->mi[0];
  344|  1.63M|  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
  345|  1.63M|  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
  346|  1.63M|  const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->bsize];
  347|  1.63M|  const int max_tx_wide = tx_size_wide[max_tx_size];
  348|  1.63M|  const int max_tx_high = tx_size_high[max_tx_size];
  349|  1.63M|  const int has_above = xd->up_available;
  350|  1.63M|  const int has_left = xd->left_available;
  351|       |
  352|  1.63M|  int above = xd->above_txfm_context[0] >= max_tx_wide;
  353|  1.63M|  int left = xd->left_txfm_context[0] >= max_tx_high;
  354|       |
  355|  1.63M|  if (has_above)
  ------------------
  |  Branch (355:7): [True: 1.49M, False: 137k]
  ------------------
  356|  1.49M|    if (is_inter_block(above_mbmi))
  ------------------
  |  Branch (356:9): [True: 112k, False: 1.38M]
  ------------------
  357|   112k|      above = block_size_wide[above_mbmi->bsize] >= max_tx_wide;
  358|       |
  359|  1.63M|  if (has_left)
  ------------------
  |  Branch (359:7): [True: 1.50M, False: 130k]
  ------------------
  360|  1.50M|    if (is_inter_block(left_mbmi))
  ------------------
  |  Branch (360:9): [True: 106k, False: 1.39M]
  ------------------
  361|   106k|      left = block_size_high[left_mbmi->bsize] >= max_tx_high;
  362|       |
  363|  1.63M|  if (has_above && has_left)
  ------------------
  |  Branch (363:7): [True: 1.49M, False: 137k]
  |  Branch (363:20): [True: 1.39M, False: 103k]
  ------------------
  364|  1.39M|    return (above + left);
  365|   240k|  else if (has_above)
  ------------------
  |  Branch (365:12): [True: 103k, False: 137k]
  ------------------
  366|   103k|    return above;
  367|   137k|  else if (has_left)
  ------------------
  |  Branch (367:12): [True: 110k, False: 27.1k]
  ------------------
  368|   110k|    return left;
  369|  27.1k|  else
  370|  27.1k|    return 0;
  371|  1.63M|}
decodemv.c:av1_get_spatial_seg_pred:
   51|  4.00M|                                               int skip_over4x4) {
   52|  4.00M|  const int step_size = skip_over4x4 ? 2 : 1;
  ------------------
  |  Branch (52:25): [True: 0, False: 4.00M]
  ------------------
   53|  4.00M|  uint8_t prev_ul = UINT8_MAX;  // top left segment_id
   54|  4.00M|  uint8_t prev_l = UINT8_MAX;   // left segment_id
   55|  4.00M|  uint8_t prev_u = UINT8_MAX;   // top segment_id
   56|  4.00M|  const int mi_row = xd->mi_row;
   57|  4.00M|  const int mi_col = xd->mi_col;
   58|  4.00M|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   59|  4.00M|  const uint8_t *seg_map = cm->cur_frame->seg_map;
   60|  4.00M|  if ((xd->up_available) && (xd->left_available)) {
  ------------------
  |  Branch (60:7): [True: 3.91M, False: 91.2k]
  |  Branch (60:29): [True: 3.83M, False: 78.0k]
  ------------------
   61|  3.83M|    prev_ul = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size,
   62|  3.83M|                             mi_col - step_size);
   63|  3.83M|  }
   64|  4.00M|  if (xd->up_available) {
  ------------------
  |  Branch (64:7): [True: 3.91M, False: 91.4k]
  ------------------
   65|  3.91M|    prev_u = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size,
   66|  3.91M|                            mi_col - 0);
   67|  3.91M|  }
   68|  4.00M|  if (xd->left_available) {
  ------------------
  |  Branch (68:7): [True: 3.91M, False: 91.0k]
  ------------------
   69|  3.91M|    prev_l = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0,
   70|  3.91M|                            mi_col - step_size);
   71|  3.91M|  }
   72|  4.00M|  assert(IMPLIES(prev_ul != UINT8_MAX,
   73|  4.00M|                 prev_u != UINT8_MAX && prev_l != UINT8_MAX));
   74|       |
   75|       |  // Pick CDF index based on number of matching/out-of-bounds segment IDs.
   76|  4.00M|  if (prev_ul == UINT8_MAX) /* Edge cases */
  ------------------
  |  Branch (76:7): [True: 169k, False: 3.83M]
  ------------------
   77|   169k|    *cdf_index = 0;
   78|  3.83M|  else if ((prev_ul == prev_u) && (prev_ul == prev_l))
  ------------------
  |  Branch (78:12): [True: 2.25M, False: 1.57M]
  |  Branch (78:35): [True: 1.36M, False: 888k]
  ------------------
   79|  1.36M|    *cdf_index = 2;
   80|  2.46M|  else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l))
  ------------------
  |  Branch (80:12): [True: 888k, False: 1.57M]
  |  Branch (80:35): [True: 825k, False: 748k]
  |  Branch (80:58): [True: 173k, False: 575k]
  ------------------
   81|  1.88M|    *cdf_index = 1;
   82|   574k|  else
   83|   574k|    *cdf_index = 0;
   84|       |
   85|       |  // If 2 or more are identical returns that as predictor, otherwise prev_l.
   86|  4.00M|  if (prev_u == UINT8_MAX)  // edge case
  ------------------
  |  Branch (86:7): [True: 91.7k, False: 3.91M]
  ------------------
   87|  91.7k|    return prev_l == UINT8_MAX ? 0 : prev_l;
  ------------------
  |  Branch (87:12): [True: 12.6k, False: 79.1k]
  ------------------
   88|  3.91M|  if (prev_l == UINT8_MAX)  // edge case
  ------------------
  |  Branch (88:7): [True: 78.2k, False: 3.83M]
  ------------------
   89|  78.2k|    return prev_u;
   90|  3.83M|  return (prev_ul == prev_u) ? prev_u : prev_l;
  ------------------
  |  Branch (90:10): [True: 2.25M, False: 1.57M]
  ------------------
   91|  3.91M|}
decodemv.c:get_segment_id:
   28|  11.6M|    BLOCK_SIZE bsize, int mi_row, int mi_col) {
   29|  11.6M|  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
   30|  11.6M|  const int bw = mi_size_wide[bsize];
   31|  11.6M|  const int bh = mi_size_high[bsize];
   32|  11.6M|  const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
  ------------------
  |  |   34|  11.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 11.6M]
  |  |  ------------------
  ------------------
   33|  11.6M|  const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
  ------------------
  |  |   34|  11.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 11.6M]
  |  |  ------------------
  ------------------
   34|  11.6M|  const int seg_stride = mi_params->mi_cols;
   35|  11.6M|  uint8_t segment_id = MAX_SEGMENTS;
  ------------------
  |  |   21|  11.6M|#define MAX_SEGMENTS 8
  ------------------
   36|       |
   37|  23.3M|  for (int y = 0; y < ymis; ++y) {
  ------------------
  |  Branch (37:19): [True: 11.6M, False: 11.6M]
  ------------------
   38|  23.3M|    for (int x = 0; x < xmis; ++x) {
  ------------------
  |  Branch (38:21): [True: 11.6M, False: 11.6M]
  ------------------
   39|  11.6M|      segment_id =
   40|  11.6M|          AOMMIN(segment_id, segment_ids[mi_offset + y * seg_stride + x]);
  ------------------
  |  |   34|  11.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 11.6M]
  |  |  ------------------
  ------------------
   41|  11.6M|    }
   42|  11.6M|  }
   43|       |
   44|  11.6M|  assert(segment_id < MAX_SEGMENTS);
   45|  11.6M|  return segment_id;
   46|  11.6M|}
decodemv.c:av1_get_skip_txfm_context:
  175|  11.7M|static inline int av1_get_skip_txfm_context(const MACROBLOCKD *xd) {
  176|  11.7M|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  177|  11.7M|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  178|  11.7M|  const int above_skip_txfm = above_mi ? above_mi->skip_txfm : 0;
  ------------------
  |  Branch (178:31): [True: 11.1M, False: 553k]
  ------------------
  179|  11.7M|  const int left_skip_txfm = left_mi ? left_mi->skip_txfm : 0;
  ------------------
  |  Branch (179:30): [True: 11.3M, False: 403k]
  ------------------
  180|  11.7M|  return above_skip_txfm + left_skip_txfm;
  181|  11.7M|}
decodemv.c:av1_get_palette_bsize_ctx:
  192|  2.32M|static inline int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) {
  193|  2.32M|  assert(bsize < BLOCK_SIZES_ALL);
  194|  2.32M|  return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8];
  195|  2.32M|}
decodemv.c:av1_get_palette_mode_ctx:
  197|   744k|static inline int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) {
  198|   744k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  199|   744k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  200|   744k|  int ctx = 0;
  201|   744k|  if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0);
  ------------------
  |  Branch (201:7): [True: 658k, False: 85.4k]
  ------------------
  202|   744k|  if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0);
  ------------------
  |  Branch (202:7): [True: 663k, False: 80.8k]
  ------------------
  203|   744k|  return ctx;
  204|   744k|}
decodemv.c:av1_get_pred_context_seg_id:
   93|  37.8k|static inline uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
   94|  37.8k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
   95|  37.8k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
   96|  37.8k|  const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0;
  ------------------
  |  Branch (96:25): [True: 30.8k, False: 6.98k]
  ------------------
   97|  37.8k|  const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0;
  ------------------
  |  Branch (97:24): [True: 31.4k, False: 6.40k]
  ------------------
   98|       |
   99|  37.8k|  return above_sip + left_sip;
  100|  37.8k|}
decodemv.c:av1_get_skip_mode_context:
  167|   231k|static inline int av1_get_skip_mode_context(const MACROBLOCKD *xd) {
  168|   231k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  169|   231k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  170|   231k|  const int above_skip_mode = above_mi ? above_mi->skip_mode : 0;
  ------------------
  |  Branch (170:31): [True: 159k, False: 72.4k]
  ------------------
  171|   231k|  const int left_skip_mode = left_mi ? left_mi->skip_mode : 0;
  ------------------
  |  Branch (171:30): [True: 174k, False: 56.5k]
  ------------------
  172|   231k|  return above_skip_mode + left_skip_mode;
  173|   231k|}
decodemv.c:av1_get_pred_cdf_uni_comp_ref_p:
  235|  87.2k|    const MACROBLOCKD *xd) {
  236|  87.2k|  const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd);
  237|  87.2k|  return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0];
  238|  87.2k|}
decodemv.c:av1_get_pred_cdf_uni_comp_ref_p1:
  241|  65.9k|    const MACROBLOCKD *xd) {
  242|  65.9k|  const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd);
  243|  65.9k|  return xd->tile_ctx->uni_comp_ref_cdf[pred_context][1];
  244|  65.9k|}
decodemv.c:av1_get_pred_cdf_uni_comp_ref_p2:
  247|  39.6k|    const MACROBLOCKD *xd) {
  248|  39.6k|  const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd);
  249|  39.6k|  return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2];
  250|  39.6k|}
decodemv.c:av1_get_pred_cdf_comp_ref_p:
  264|   437k|static inline aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) {
  265|   437k|  const int pred_context = av1_get_pred_context_comp_ref_p(xd);
  266|   437k|  return xd->tile_ctx->comp_ref_cdf[pred_context][0];
  267|   437k|}
decodemv.c:av1_get_pred_cdf_comp_ref_p1:
  270|   335k|    const MACROBLOCKD *xd) {
  271|   335k|  const int pred_context = av1_get_pred_context_comp_ref_p1(xd);
  272|   335k|  return xd->tile_ctx->comp_ref_cdf[pred_context][1];
  273|   335k|}
decodemv.c:av1_get_pred_cdf_comp_ref_p2:
  276|   101k|    const MACROBLOCKD *xd) {
  277|   101k|  const int pred_context = av1_get_pred_context_comp_ref_p2(xd);
  278|   101k|  return xd->tile_ctx->comp_ref_cdf[pred_context][2];
  279|   101k|}
decodemv.c:av1_get_pred_cdf_comp_bwdref_p:
  282|   437k|    const MACROBLOCKD *xd) {
  283|   437k|  const int pred_context = av1_get_pred_context_comp_bwdref_p(xd);
  284|   437k|  return xd->tile_ctx->comp_bwdref_cdf[pred_context][0];
  285|   437k|}
decodemv.c:av1_get_pred_cdf_comp_bwdref_p1:
  288|   183k|    const MACROBLOCKD *xd) {
  289|   183k|  const int pred_context = av1_get_pred_context_comp_bwdref_p1(xd);
  290|   183k|  return xd->tile_ctx->comp_bwdref_cdf[pred_context][1];
  291|   183k|}
decodemv.c:av1_get_pred_cdf_single_ref_p1:
  308|  3.32M|    const MACROBLOCKD *xd) {
  309|  3.32M|  return xd->tile_ctx
  310|  3.32M|      ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0];
  311|  3.32M|}
decodemv.c:av1_get_pred_cdf_single_ref_p2:
  313|   472k|    const MACROBLOCKD *xd) {
  314|   472k|  return xd->tile_ctx
  315|   472k|      ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1];
  316|   472k|}
decodemv.c:av1_get_pred_cdf_single_ref_p6:
  333|   215k|    const MACROBLOCKD *xd) {
  334|   215k|  return xd->tile_ctx
  335|   215k|      ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5];
  336|   215k|}
decodemv.c:av1_get_pred_cdf_single_ref_p3:
  318|  2.85M|    const MACROBLOCKD *xd) {
  319|  2.85M|  return xd->tile_ctx
  320|  2.85M|      ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2];
  321|  2.85M|}
decodemv.c:av1_get_pred_cdf_single_ref_p5:
  328|   245k|    const MACROBLOCKD *xd) {
  329|   245k|  return xd->tile_ctx
  330|   245k|      ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4];
  331|   245k|}
decodemv.c:av1_get_pred_cdf_single_ref_p4:
  323|  2.60M|    const MACROBLOCKD *xd) {
  324|  2.60M|  return xd->tile_ctx
  325|  2.60M|      ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3];
  326|  2.60M|}
decodemv.c:get_comp_group_idx_context:
  141|   500k|static inline int get_comp_group_idx_context(const MACROBLOCKD *xd) {
  142|   500k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  143|   500k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  144|   500k|  int above_ctx = 0, left_ctx = 0;
  145|       |
  146|   500k|  if (above_mi) {
  ------------------
  |  Branch (146:7): [True: 456k, False: 44.1k]
  ------------------
  147|   456k|    if (has_second_ref(above_mi))
  ------------------
  |  Branch (147:9): [True: 295k, False: 160k]
  ------------------
  148|   295k|      above_ctx = above_mi->comp_group_idx;
  149|   160k|    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
  ------------------
  |  Branch (149:14): [True: 21.1k, False: 139k]
  ------------------
  150|  21.1k|      above_ctx = 3;
  151|   456k|  }
  152|   500k|  if (left_mi) {
  ------------------
  |  Branch (152:7): [True: 481k, False: 19.2k]
  ------------------
  153|   481k|    if (has_second_ref(left_mi))
  ------------------
  |  Branch (153:9): [True: 326k, False: 154k]
  ------------------
  154|   326k|      left_ctx = left_mi->comp_group_idx;
  155|   154k|    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
  ------------------
  |  Branch (155:14): [True: 20.4k, False: 134k]
  ------------------
  156|  20.4k|      left_ctx = 3;
  157|   481k|  }
  158|       |
  159|   500k|  return AOMMIN(5, above_ctx + left_ctx);
  ------------------
  |  |   34|   500k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.45k, False: 497k]
  |  |  ------------------
  ------------------
  160|   500k|}
decodemv.c:get_comp_index_context:
  103|   337k|                                         const MACROBLOCKD *xd) {
  104|   337k|  MB_MODE_INFO *mbmi = xd->mi[0];
  105|   337k|  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
  106|   337k|  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
  107|   337k|  int bck_frame_index = 0, fwd_frame_index = 0;
  108|   337k|  int cur_frame_index = cm->cur_frame->order_hint;
  109|       |
  110|   337k|  if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
  ------------------
  |  Branch (110:7): [True: 337k, False: 18.4E]
  ------------------
  111|   337k|  if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
  ------------------
  |  Branch (111:7): [True: 337k, False: 18.4E]
  ------------------
  112|       |
  113|   337k|  int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info,
  114|   337k|                                  fwd_frame_index, cur_frame_index));
  115|   337k|  int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info,
  116|   337k|                                  cur_frame_index, bck_frame_index));
  117|       |
  118|   337k|  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
  119|   337k|  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
  120|       |
  121|   337k|  int above_ctx = 0, left_ctx = 0;
  122|   337k|  const int offset = (fwd == bck);
  123|       |
  124|   337k|  if (above_mi != NULL) {
  ------------------
  |  Branch (124:7): [True: 309k, False: 27.7k]
  ------------------
  125|   309k|    if (has_second_ref(above_mi))
  ------------------
  |  Branch (125:9): [True: 203k, False: 105k]
  ------------------
  126|   203k|      above_ctx = above_mi->compound_idx;
  127|   105k|    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
  ------------------
  |  Branch (127:14): [True: 15.0k, False: 90.4k]
  ------------------
  128|  15.0k|      above_ctx = 1;
  129|   309k|  }
  130|       |
  131|   337k|  if (left_mi != NULL) {
  ------------------
  |  Branch (131:7): [True: 318k, False: 18.3k]
  ------------------
  132|   318k|    if (has_second_ref(left_mi))
  ------------------
  |  Branch (132:9): [True: 219k, False: 98.9k]
  ------------------
  133|   219k|      left_ctx = left_mi->compound_idx;
  134|  98.9k|    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
  ------------------
  |  Branch (134:14): [True: 14.2k, False: 84.7k]
  ------------------
  135|  14.2k|      left_ctx = 1;
  136|   318k|  }
  137|       |
  138|   337k|  return above_ctx + left_ctx + 3 * offset;
  139|   337k|}

av1_dc_quant_QTX:
  198|  47.1M|int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
  199|  47.1M|  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
  ------------------
  |  |   26|  47.1M|#define MAXQ 255
  ------------------
  200|  47.1M|  switch (bit_depth) {
  201|  23.7M|    case AOM_BITS_8: return dc_qlookup_QTX[q_clamped];
  ------------------
  |  Branch (201:5): [True: 23.7M, False: 23.4M]
  ------------------
  202|  15.0M|    case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped];
  ------------------
  |  Branch (202:5): [True: 15.0M, False: 32.0M]
  ------------------
  203|  8.33M|    case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped];
  ------------------
  |  Branch (203:5): [True: 8.33M, False: 38.7M]
  ------------------
  204|      0|    default:
  ------------------
  |  Branch (204:5): [True: 0, False: 47.1M]
  ------------------
  205|      0|      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
  206|      0|      return -1;
  207|  47.1M|  }
  208|  47.1M|}
av1_ac_quant_QTX:
  210|  47.1M|int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
  211|  47.1M|  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
  ------------------
  |  |   26|  47.1M|#define MAXQ 255
  ------------------
  212|  47.1M|  switch (bit_depth) {
  213|  23.7M|    case AOM_BITS_8: return ac_qlookup_QTX[q_clamped];
  ------------------
  |  Branch (213:5): [True: 23.7M, False: 23.4M]
  ------------------
  214|  15.0M|    case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped];
  ------------------
  |  Branch (214:5): [True: 15.0M, False: 32.0M]
  ------------------
  215|  8.33M|    case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped];
  ------------------
  |  Branch (215:5): [True: 8.33M, False: 38.7M]
  ------------------
  216|      0|    default:
  ------------------
  |  Branch (216:5): [True: 0, False: 47.1M]
  ------------------
  217|      0|      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
  218|      0|      return -1;
  219|  47.1M|  }
  220|  47.1M|}
av1_get_qindex:
  223|  18.8M|                   int base_qindex) {
  224|  18.8M|  if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
  ------------------
  |  Branch (224:7): [True: 6.87M, False: 11.9M]
  ------------------
  225|  6.87M|    const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
  226|  6.87M|    const int seg_qindex = base_qindex + data;
  227|  6.87M|    return clamp(seg_qindex, 0, MAXQ);
  ------------------
  |  |   26|  6.87M|#define MAXQ 255
  ------------------
  228|  11.9M|  } else {
  229|  11.9M|    return base_qindex;
  230|  11.9M|  }
  231|  18.8M|}
av1_use_qmatrix:
  234|   347k|                     const struct macroblockd *xd, int segment_id) {
  235|       |  // True if explicit Q matrix levels and this is not a lossless segment.
  236|   347k|  return quant_params->using_qmatrix && !xd->lossless[segment_id];
  ------------------
  |  Branch (236:10): [True: 101k, False: 246k]
  |  Branch (236:41): [True: 90.0k, False: 11.3k]
  ------------------
  237|   347k|}
av1_get_iqmatrix:
  245|  21.0M|                                 TX_SIZE tx_size, TX_TYPE tx_type) {
  246|  21.0M|  const struct macroblockd_plane *const pd = &xd->plane[plane];
  247|  21.0M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
  248|  21.0M|  const int seg_id = mbmi->segment_id;
  249|  21.0M|  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
  250|       |  // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
  251|  21.0M|  return is_2d_transform(tx_type)
  ------------------
  |  Branch (251:10): [True: 18.3M, False: 2.70M]
  ------------------
  252|  21.0M|             ? pd->seg_iqmatrix[seg_id][qm_tx_size]
  253|  21.0M|             : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
  ------------------
  |  |   31|  2.70M|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  2.70M|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  254|  21.0M|}
av1_qm_init:
  277|  16.1k|void av1_qm_init(CommonQuantParams *quant_params, int num_planes) {
  278|  16.1k|#if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER
  279|   273k|  for (int q = 0; q < NUM_QM_LEVELS; ++q) {
  ------------------
  |  |   31|   273k|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|   273k|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (279:19): [True: 257k, False: 16.1k]
  ------------------
  280|  1.03M|    for (int c = 0; c < num_planes; ++c) {
  ------------------
  |  Branch (280:21): [True: 773k, False: 257k]
  ------------------
  281|   773k|      int current = 0;
  282|  15.4M|      for (int t = 0; t < TX_SIZES_ALL; ++t) {
  ------------------
  |  Branch (282:23): [True: 14.6M, False: 773k]
  ------------------
  283|  14.6M|        const int size = tx_size_2d[t];
  284|  14.6M|        const int qm_tx_size = av1_get_adjusted_tx_size(t);
  285|  14.6M|        if (q == NUM_QM_LEVELS - 1) {
  ------------------
  |  |   31|  14.6M|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  14.6M|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (285:13): [True: 918k, False: 13.7M]
  ------------------
  286|   918k|          quant_params->gqmatrix[q][c][t] = NULL;
  287|   918k|          quant_params->giqmatrix[q][c][t] = NULL;
  288|  13.7M|        } else if (t != qm_tx_size) {  // Reuse matrices for 'qm_tx_size'
  ------------------
  |  Branch (288:20): [True: 3.62M, False: 10.1M]
  ------------------
  289|  3.62M|          assert(t > qm_tx_size);
  290|  3.62M|          quant_params->gqmatrix[q][c][t] =
  291|  3.62M|              quant_params->gqmatrix[q][c][qm_tx_size];
  292|  3.62M|          quant_params->giqmatrix[q][c][t] =
  293|  3.62M|              quant_params->giqmatrix[q][c][qm_tx_size];
  294|  10.1M|        } else {
  295|  10.1M|          assert(current + size <= QM_TOTAL_SIZE);
  296|  10.1M|          quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current];
  297|  10.1M|          quant_params->giqmatrix[q][c][t] =
  298|  10.1M|              &iwt_matrix_ref[q][c >= 1][current];
  299|  10.1M|          current += size;
  300|  10.1M|        }
  301|  14.6M|      }
  302|   773k|    }
  303|   257k|  }
  304|       |#else
  305|       |  (void)quant_params;
  306|       |  (void)num_planes;
  307|       |#endif  // CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER
  308|  16.1k|}
quant_common.c:is_2d_transform:
  241|  21.1M|static inline bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); }

av1_init_warp_params:
   60|  11.7M|                          const MACROBLOCKD *xd, const MB_MODE_INFO *mi) {
   61|  11.7M|  if (inter_pred_params->block_height < 8 || inter_pred_params->block_width < 8)
  ------------------
  |  Branch (61:7): [True: 3.96M, False: 7.80M]
  |  Branch (61:46): [True: 1.42M, False: 6.37M]
  ------------------
   62|  5.39M|    return;
   63|       |
   64|  6.37M|  if (xd->cur_frame_force_integer_mv) return;
  ------------------
  |  Branch (64:7): [True: 138k, False: 6.23M]
  ------------------
   65|       |
   66|  6.23M|  if (allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
  ------------------
  |  Branch (66:7): [True: 502k, False: 5.73M]
  ------------------
   67|  6.23M|                 inter_pred_params->scale_factors,
   68|  6.23M|                 &inter_pred_params->warp_params)) {
   69|       |#if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
   70|       |    aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_FEATURE,
   71|       |                       "Warped motion is disabled in realtime only build.");
   72|       |#endif  // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
   73|   502k|    inter_pred_params->mode = WARP_PRED;
   74|   502k|  }
   75|  6.23M|}
av1_make_inter_predictor:
   80|  17.0M|                              const SubpelParams *subpel_params) {
   81|  17.0M|  assert(IMPLIES(inter_pred_params->conv_params.is_compound,
   82|  17.0M|                 inter_pred_params->conv_params.dst != NULL));
   83|       |
   84|  17.0M|  if (inter_pred_params->mode == TRANSLATION_PRED) {
  ------------------
  |  Branch (84:7): [True: 16.5M, False: 502k]
  ------------------
   85|  16.5M|#if CONFIG_AV1_HIGHBITDEPTH
   86|  16.5M|    if (inter_pred_params->use_hbd_buf) {
  ------------------
  |  Branch (86:9): [True: 8.38M, False: 8.14M]
  ------------------
   87|  8.38M|      highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
   88|  8.38M|                             inter_pred_params->block_width,
   89|  8.38M|                             inter_pred_params->block_height,
   90|  8.38M|                             &inter_pred_params->conv_params,
   91|  8.38M|                             inter_pred_params->interp_filter_params,
   92|  8.38M|                             inter_pred_params->bit_depth);
   93|  8.38M|    } else {
   94|  8.14M|      inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
   95|  8.14M|                      inter_pred_params->block_width,
   96|  8.14M|                      inter_pred_params->block_height,
   97|  8.14M|                      &inter_pred_params->conv_params,
   98|  8.14M|                      inter_pred_params->interp_filter_params);
   99|  8.14M|    }
  100|       |#else
  101|       |    inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
  102|       |                    inter_pred_params->block_width,
  103|       |                    inter_pred_params->block_height,
  104|       |                    &inter_pred_params->conv_params,
  105|       |                    inter_pred_params->interp_filter_params);
  106|       |#endif
  107|  16.5M|  }
  108|   502k|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
  109|       |  // TODO(jingning): av1_warp_plane() can be further cleaned up.
  110|   502k|  else if (inter_pred_params->mode == WARP_PRED) {
  ------------------
  |  Branch (110:12): [True: 502k, False: 18.4E]
  ------------------
  111|   502k|    av1_warp_plane(
  112|   502k|        &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
  113|   502k|        inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
  114|   502k|        inter_pred_params->ref_frame_buf.width,
  115|   502k|        inter_pred_params->ref_frame_buf.height,
  116|   502k|        inter_pred_params->ref_frame_buf.stride, dst,
  117|   502k|        inter_pred_params->pix_col, inter_pred_params->pix_row,
  118|   502k|        inter_pred_params->block_width, inter_pred_params->block_height,
  119|   502k|        dst_stride, inter_pred_params->subsampling_x,
  120|   502k|        inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
  121|   502k|  }
  122|  18.4E|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
  123|  18.4E|  else {
  124|  18.4E|    assert(0 && "Unsupported inter_pred_params->mode");
  125|  18.4E|  }
  126|  17.0M|}
av1_get_compound_type_mask:
  291|   410k|    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
  292|   410k|  (void)sb_type;
  293|   410k|  switch (comp_data->type) {
  294|   164k|    case COMPOUND_WEDGE:
  ------------------
  |  Branch (294:5): [True: 164k, False: 245k]
  ------------------
  295|   164k|      return av1_get_contiguous_soft_mask(comp_data->wedge_index,
  296|   164k|                                          comp_data->wedge_sign, sb_type);
  297|   245k|    default: return comp_data->seg_mask;
  ------------------
  |  Branch (297:5): [True: 245k, False: 164k]
  ------------------
  298|   410k|  }
  299|   410k|}
av1_init_wedge_masks:
  600|  16.1k|void av1_init_wedge_masks(void) { aom_once(init_all_wedge_masks); }
av1_make_masked_inter_predictor:
  632|   410k|                                     const SubpelParams *subpel_params) {
  633|   410k|  const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp;
  634|   410k|  BLOCK_SIZE sb_type = inter_pred_params->sb_type;
  635|       |
  636|       |  // We're going to call av1_make_inter_predictor to generate a prediction into
  637|       |  // a temporary buffer, then will blend that temporary buffer with that from
  638|       |  // the other reference.
  639|   410k|  DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]);
  ------------------
  |  |   19|   410k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  640|   410k|  uint8_t *tmp_dst =
  641|   410k|      inter_pred_params->use_hbd_buf ? CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf;
  ------------------
  |  |   76|   284k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  |  Branch (641:7): [True: 284k, False: 125k]
  ------------------
  642|       |
  643|   410k|  const int tmp_buf_stride = MAX_SB_SIZE;
  ------------------
  |  |   32|   410k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   410k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  644|   410k|  CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst;
  645|   410k|  int org_dst_stride = inter_pred_params->conv_params.dst_stride;
  646|   410k|  CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
  647|   410k|  inter_pred_params->conv_params.dst = tmp_buf16;
  648|   410k|  inter_pred_params->conv_params.dst_stride = tmp_buf_stride;
  649|   410k|  assert(inter_pred_params->conv_params.do_average == 0);
  650|       |
  651|       |  // This will generate a prediction in tmp_buf for the second reference
  652|   410k|  av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE,
  ------------------
  |  |   32|   410k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   410k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  653|   410k|                           inter_pred_params, subpel_params);
  654|       |
  655|   410k|  if (!inter_pred_params->conv_params.plane &&
  ------------------
  |  Branch (655:7): [True: 137k, False: 273k]
  ------------------
  656|   410k|      comp_data->type == COMPOUND_DIFFWTD) {
  ------------------
  |  Branch (656:7): [True: 82.3k, False: 55.1k]
  ------------------
  657|  82.3k|    av1_build_compound_diffwtd_mask_d16(
  658|  82.3k|        comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
  659|  82.3k|        tmp_buf16, tmp_buf_stride, inter_pred_params->block_height,
  660|  82.3k|        inter_pred_params->block_width, &inter_pred_params->conv_params,
  661|  82.3k|        inter_pred_params->bit_depth);
  662|  82.3k|  }
  663|   410k|  build_masked_compound_no_round(
  664|   410k|      dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride,
  665|   410k|      comp_data, sb_type, inter_pred_params->block_height,
  666|   410k|      inter_pred_params->block_width, inter_pred_params);
  667|   410k|}
av1_dist_wtd_comp_weight_assign:
  673|  14.6M|                                     int is_compound) {
  674|  14.6M|  assert(fwd_offset != NULL && bck_offset != NULL);
  675|  14.6M|  if (!is_compound || mbmi->compound_idx) {
  ------------------
  |  Branch (675:7): [True: 11.2M, False: 3.36M]
  |  Branch (675:23): [True: 2.74M, False: 627k]
  ------------------
  676|  14.0M|    *fwd_offset = 8;
  677|  14.0M|    *bck_offset = 8;
  678|  14.0M|    *use_dist_wtd_comp_avg = 0;
  679|  14.0M|    return;
  680|  14.0M|  }
  681|       |
  682|   627k|  *use_dist_wtd_comp_avg = 1;
  683|   627k|  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
  684|   627k|  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
  685|   627k|  const int cur_frame_index = cm->cur_frame->order_hint;
  686|   627k|  int bck_frame_index = 0, fwd_frame_index = 0;
  687|       |
  688|   627k|  if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
  ------------------
  |  Branch (688:7): [True: 627k, False: 18.4E]
  ------------------
  689|   627k|  if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
  ------------------
  |  Branch (689:7): [True: 627k, False: 18.4E]
  ------------------
  690|       |
  691|   627k|  int d0 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
  692|   627k|                                       fwd_frame_index, cur_frame_index)),
  693|   627k|                 0, MAX_FRAME_DISTANCE);
  ------------------
  |  |   68|   627k|#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
  |  |  ------------------
  |  |  |  |   67|   627k|#define FRAME_OFFSET_BITS 5
  |  |  ------------------
  ------------------
  694|   627k|  int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
  695|   627k|                                       cur_frame_index, bck_frame_index)),
  696|   627k|                 0, MAX_FRAME_DISTANCE);
  ------------------
  |  |   68|   627k|#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
  |  |  ------------------
  |  |  |  |   67|   627k|#define FRAME_OFFSET_BITS 5
  |  |  ------------------
  ------------------
  697|       |
  698|   627k|  const int order = d0 <= d1;
  699|       |
  700|   627k|  if (d0 == 0 || d1 == 0) {
  ------------------
  |  Branch (700:7): [True: 3.81k, False: 623k]
  |  Branch (700:18): [True: 1.30k, False: 622k]
  ------------------
  701|  5.40k|    *fwd_offset = quant_dist_lookup_table[3][order];
  702|  5.40k|    *bck_offset = quant_dist_lookup_table[3][1 - order];
  703|  5.40k|    return;
  704|  5.40k|  }
  705|       |
  706|   622k|  int i;
  707|  1.09M|  for (i = 0; i < 3; ++i) {
  ------------------
  |  Branch (707:15): [True: 941k, False: 150k]
  ------------------
  708|   941k|    int c0 = quant_dist_weight[i][order];
  709|   941k|    int c1 = quant_dist_weight[i][!order];
  710|   941k|    int d0_c0 = d0 * c0;
  711|   941k|    int d1_c1 = d1 * c1;
  712|   941k|    if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
  ------------------
  |  Branch (712:10): [True: 68.3k, False: 873k]
  |  Branch (712:21): [True: 9.48k, False: 58.8k]
  |  Branch (712:40): [True: 873k, False: 58.8k]
  |  Branch (712:52): [True: 461k, False: 411k]
  ------------------
  713|   941k|  }
  714|       |
  715|   622k|  *fwd_offset = quant_dist_lookup_table[i][order];
  716|   622k|  *bck_offset = quant_dist_lookup_table[i][1 - order];
  717|   622k|}
av1_setup_dst_planes:
  721|  24.7M|                          const int plane_start, const int plane_end) {
  722|       |  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
  723|       |  // the static analysis warnings.
  724|  94.8M|  for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) {
  ------------------
  |  |   34|  94.8M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.17M, False: 91.6M]
  |  |  ------------------
  ------------------
  |  Branch (724:29): [True: 70.1M, False: 24.7M]
  ------------------
  725|  70.1M|    struct macroblockd_plane *const pd = &planes[i];
  726|  70.1M|    const int is_uv = i > 0;
  727|  70.1M|    setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv],
  728|  70.1M|                     src->crop_heights[is_uv], src->strides[is_uv], mi_row,
  729|  70.1M|                     mi_col, NULL, pd->subsampling_x, pd->subsampling_y);
  730|  70.1M|  }
  731|  24.7M|}
av1_setup_pre_planes:
  736|  6.00M|                          const int num_planes) {
  737|  6.00M|  if (src != NULL) {
  ------------------
  |  Branch (737:7): [True: 6.00M, False: 18.4E]
  ------------------
  738|       |    // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
  739|       |    // the static analysis warnings.
  740|  23.9M|    for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
  ------------------
  |  |   34|  23.9M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 52.2k, False: 23.8M]
  |  |  ------------------
  ------------------
  |  Branch (740:21): [True: 17.9M, False: 6.00M]
  ------------------
  741|  17.9M|      struct macroblockd_plane *const pd = &xd->plane[i];
  742|  17.9M|      const int is_uv = i > 0;
  743|  17.9M|      setup_pred_plane(&pd->pre[idx], xd->mi[0]->bsize, src->buffers[i],
  744|  17.9M|                       src->crop_widths[is_uv], src->crop_heights[is_uv],
  745|  17.9M|                       src->strides[is_uv], mi_row, mi_col, sf,
  746|  17.9M|                       pd->subsampling_x, pd->subsampling_y);
  747|  17.9M|    }
  748|  6.00M|  }
  749|  6.00M|}
av1_get_obmc_mask:
  774|  2.88M|const uint8_t *av1_get_obmc_mask(int length) {
  775|  2.88M|  switch (length) {
  776|      0|    case 1: return obmc_mask_1;
  ------------------
  |  Branch (776:5): [True: 0, False: 2.88M]
  ------------------
  777|   585k|    case 2: return obmc_mask_2;
  ------------------
  |  Branch (777:5): [True: 585k, False: 2.29M]
  ------------------
  778|  1.40M|    case 4: return obmc_mask_4;
  ------------------
  |  Branch (778:5): [True: 1.40M, False: 1.47M]
  ------------------
  779|   708k|    case 8: return obmc_mask_8;
  ------------------
  |  Branch (779:5): [True: 708k, False: 2.17M]
  ------------------
  780|   161k|    case 16: return obmc_mask_16;
  ------------------
  |  Branch (780:5): [True: 161k, False: 2.72M]
  ------------------
  781|  20.0k|    case 32: return obmc_mask_32;
  ------------------
  |  Branch (781:5): [True: 20.0k, False: 2.86M]
  ------------------
  782|      0|    case 64: return obmc_mask_64;
  ------------------
  |  Branch (782:5): [True: 0, False: 2.88M]
  ------------------
  783|      0|    default: assert(0); return NULL;
  ------------------
  |  Branch (783:5): [True: 0, False: 2.88M]
  ------------------
  784|  2.88M|  }
  785|  2.88M|}
av1_count_overlappable_neighbors:
  801|  4.55M|void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) {
  802|  4.55M|  MB_MODE_INFO *mbmi = xd->mi[0];
  803|       |
  804|  4.55M|  mbmi->overlappable_neighbors = 0;
  805|       |
  806|  4.55M|  if (!is_motion_variation_allowed_bsize(mbmi->bsize)) return;
  ------------------
  |  Branch (806:7): [True: 1.52M, False: 3.02M]
  ------------------
  807|       |
  808|  3.02M|  foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr,
  809|  3.02M|                                &mbmi->overlappable_neighbors);
  810|  3.02M|  if (mbmi->overlappable_neighbors) return;
  ------------------
  |  Branch (810:7): [True: 2.52M, False: 498k]
  ------------------
  811|   498k|  foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr,
  812|   498k|                               &mbmi->overlappable_neighbors);
  813|   498k|}
av1_skip_u4x4_pred_in_obmc:
  821|  7.48M|                               const struct macroblockd_plane *pd, int dir) {
  822|  7.48M|  assert(is_motion_variation_allowed_bsize(bsize));
  823|       |
  824|  7.48M|  const BLOCK_SIZE bsize_plane =
  825|  7.48M|      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
  826|  7.48M|  switch (bsize_plane) {
  827|       |#if DISABLE_CHROMA_U8X8_OBMC
  828|       |    case BLOCK_4X4:
  829|       |    case BLOCK_8X4:
  830|       |    case BLOCK_4X8: return 1;
  831|       |#else
  832|  1.18M|    case BLOCK_4X4:
  ------------------
  |  Branch (832:5): [True: 1.18M, False: 6.29M]
  ------------------
  833|  2.63M|    case BLOCK_8X4:
  ------------------
  |  Branch (833:5): [True: 1.45M, False: 6.02M]
  ------------------
  834|  3.38M|    case BLOCK_4X8: return dir == 0;
  ------------------
  |  Branch (834:5): [True: 741k, False: 6.73M]
  ------------------
  835|      0|#endif
  836|  4.09M|    default: return 0;
  ------------------
  |  Branch (836:5): [True: 4.09M, False: 3.38M]
  ------------------
  837|  7.48M|  }
  838|  7.48M|}
av1_build_obmc_inter_prediction:
  939|   633k|                                     int left_stride[MAX_MB_PLANE]) {
  940|   633k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  941|       |
  942|       |  // handle above row
  943|   633k|  struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
  944|   633k|  foreach_overlappable_nb_above(cm, xd,
  945|   633k|                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
  946|   633k|                                build_obmc_inter_pred_above, &ctxt_above);
  947|       |
  948|       |  // handle left column
  949|   633k|  struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride };
  950|   633k|  foreach_overlappable_nb_left(cm, xd,
  951|   633k|                               max_neighbor_obmc[mi_size_high_log2[bsize]],
  952|   633k|                               build_obmc_inter_pred_left, &ctxt_left);
  953|   633k|}
av1_setup_obmc_dst_bufs:
  956|   633k|                             uint8_t **dst_buf2) {
  957|   633k|  if (is_cur_buf_hbd(xd)) {
  ------------------
  |  Branch (957:7): [True: 297k, False: 335k]
  ------------------
  958|   297k|    int len = sizeof(uint16_t);
  959|   297k|    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
  ------------------
  |  |   76|   297k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  960|   297k|    dst_buf1[1] =
  961|   297k|        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
  ------------------
  |  |   76|   297k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  962|   297k|    dst_buf1[2] =
  963|   297k|        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
  ------------------
  |  |   76|   297k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  964|   297k|    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
  ------------------
  |  |   76|   297k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  965|   297k|    dst_buf2[1] =
  966|   297k|        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
  ------------------
  |  |   76|   297k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  967|   297k|    dst_buf2[2] =
  968|   297k|        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
  ------------------
  |  |   76|   297k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
  969|   335k|  } else {
  970|   335k|    dst_buf1[0] = xd->tmp_obmc_bufs[0];
  971|   335k|    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
  ------------------
  |  |   33|   335k|#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|   335k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   335k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|   335k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   335k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  972|   335k|    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
  ------------------
  |  |   33|   335k|#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|   335k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   335k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|   335k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   335k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  973|   335k|    dst_buf2[0] = xd->tmp_obmc_bufs[1];
  974|   335k|    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
  ------------------
  |  |   33|   335k|#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|   335k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   335k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|   335k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   335k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  975|   335k|    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
  ------------------
  |  |   33|   335k|#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|   335k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   335k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
  |  |  ------------------
  |  |  |  |   32|   335k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   335k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  976|   335k|  }
  977|   633k|}
av1_setup_build_prediction_by_above_pred:
  983|   627k|    const int num_planes) {
  984|   627k|  const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->bsize);
  ------------------
  |  |   35|   627k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 42.2k, False: 584k]
  |  |  ------------------
  ------------------
  985|   627k|  const int above_mi_col = xd->mi_col + rel_mi_col;
  986|       |
  987|   627k|  modify_neighbor_predictor_for_obmc(above_mbmi);
  988|       |
  989|  2.50M|  for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (989:19): [True: 1.87M, False: 627k]
  ------------------
  990|  1.87M|    struct macroblockd_plane *const pd = &xd->plane[j];
  991|  1.87M|    setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
  992|  1.87M|                     ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col,
  993|  1.87M|                     NULL, pd->subsampling_x, pd->subsampling_y);
  994|  1.87M|  }
  995|       |
  996|   627k|  const int num_refs = 1 + has_second_ref(above_mbmi);
  997|       |
  998|  1.25M|  for (int ref = 0; ref < num_refs; ++ref) {
  ------------------
  |  Branch (998:21): [True: 627k, False: 627k]
  ------------------
  999|   627k|    const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
 1000|       |
 1001|   627k|    const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
 1002|   627k|    const struct scale_factors *const sf =
 1003|   627k|        get_ref_scale_factors_const(ctxt->cm, frame);
 1004|   627k|    xd->block_ref_scale_factors[ref] = sf;
 1005|   627k|    if ((!av1_is_valid_scale(sf)))
  ------------------
  |  Branch (1005:9): [True: 0, False: 627k]
  ------------------
 1006|      0|      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
 1007|      0|                         "Reference frame has invalid dimensions");
 1008|   627k|    av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf,
 1009|   627k|                         num_planes);
 1010|   627k|  }
 1011|       |
 1012|   627k|  xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
  ------------------
  |  |   40|   627k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   627k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1013|   627k|  xd->mb_to_right_edge =
 1014|   627k|      ctxt->mb_to_far_edge +
 1015|   627k|      (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8;
  ------------------
  |  |   40|   627k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   627k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1016|   627k|}
av1_setup_build_prediction_by_left_pred:
 1022|   622k|                                             const int num_planes) {
 1023|   622k|  const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->bsize);
  ------------------
  |  |   35|   622k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 43.2k, False: 578k]
  |  |  ------------------
  ------------------
 1024|   622k|  const int left_mi_row = xd->mi_row + rel_mi_row;
 1025|       |
 1026|   622k|  modify_neighbor_predictor_for_obmc(left_mbmi);
 1027|       |
 1028|  2.48M|  for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (1028:19): [True: 1.86M, False: 622k]
  ------------------
 1029|  1.86M|    struct macroblockd_plane *const pd = &xd->plane[j];
 1030|  1.86M|    setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
 1031|  1.86M|                     ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0,
 1032|  1.86M|                     NULL, pd->subsampling_x, pd->subsampling_y);
 1033|  1.86M|  }
 1034|       |
 1035|   622k|  const int num_refs = 1 + has_second_ref(left_mbmi);
 1036|       |
 1037|  1.24M|  for (int ref = 0; ref < num_refs; ++ref) {
  ------------------
  |  Branch (1037:21): [True: 622k, False: 622k]
  ------------------
 1038|   622k|    const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
 1039|       |
 1040|   622k|    const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
 1041|   622k|    const struct scale_factors *const ref_scale_factors =
 1042|   622k|        get_ref_scale_factors_const(ctxt->cm, frame);
 1043|       |
 1044|   622k|    xd->block_ref_scale_factors[ref] = ref_scale_factors;
 1045|   622k|    if ((!av1_is_valid_scale(ref_scale_factors)))
  ------------------
  |  Branch (1045:9): [True: 0, False: 622k]
  ------------------
 1046|      0|      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
 1047|      0|                         "Reference frame has invalid dimensions");
 1048|   622k|    av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col,
 1049|   622k|                         ref_scale_factors, num_planes);
 1050|   622k|  }
 1051|       |
 1052|   622k|  xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row));
  ------------------
  |  |   29|   622k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
 1053|   622k|  xd->mb_to_bottom_edge =
 1054|   622k|      ctxt->mb_to_far_edge +
 1055|   622k|      GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE);
  ------------------
  |  |   29|   622k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
 1056|   622k|}
av1_build_intra_predictors_for_interintra:
 1119|   958k|                                               uint8_t *dst, int dst_stride) {
 1120|   958k|  struct macroblockd_plane *const pd = &xd->plane[plane];
 1121|   958k|  const int ssx = xd->plane[plane].subsampling_x;
 1122|   958k|  const int ssy = xd->plane[plane].subsampling_y;
 1123|   958k|  BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
 1124|   958k|  PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode];
 1125|   958k|  assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0);
 1126|   958k|  assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
 1127|   958k|  assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0);
 1128|   958k|  assert(xd->mi[0]->use_intrabc == 0);
 1129|   958k|  const SequenceHeader *seq_params = cm->seq_params;
 1130|       |
 1131|   958k|  av1_predict_intra_block(xd, seq_params->sb_size,
 1132|   958k|                          seq_params->enable_intra_edge_filter, pd->width,
 1133|   958k|                          pd->height, max_txsize_rect_lookup[plane_bsize], mode,
 1134|   958k|                          0, 0, FILTER_INTRA_MODES, ctx->plane[plane],
 1135|   958k|                          ctx->stride[plane], dst, dst_stride, 0, 0, plane);
 1136|   958k|}
av1_combine_interintra:
 1140|   958k|                            const uint8_t *intra_pred, int intra_stride) {
 1141|   958k|  const int ssx = xd->plane[plane].subsampling_x;
 1142|   958k|  const int ssy = xd->plane[plane].subsampling_y;
 1143|   958k|  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
 1144|   958k|#if CONFIG_AV1_HIGHBITDEPTH
 1145|   958k|  if (is_cur_buf_hbd(xd)) {
  ------------------
  |  Branch (1145:7): [True: 525k, False: 433k]
  ------------------
 1146|   525k|    combine_interintra_highbd(
 1147|   525k|        xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
 1148|   525k|        xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
  ------------------
  |  |   40|   525k|#define INTERINTRA_WEDGE_SIGN 0
  ------------------
 1149|   525k|        plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
 1150|   525k|        inter_pred, inter_stride, intra_pred, intra_stride, xd->bd);
 1151|   525k|    return;
 1152|   525k|  }
 1153|   433k|#endif
 1154|   433k|  combine_interintra(
 1155|   433k|      xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
 1156|   433k|      xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
  ------------------
  |  |   40|   433k|#define INTERINTRA_WEDGE_SIGN 0
  ------------------
 1157|   433k|      plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
 1158|   433k|      inter_pred, inter_stride, intra_pred, intra_stride);
 1159|   433k|}
av1_build_interintra_predictor:
 1165|   958k|                                    BLOCK_SIZE bsize) {
 1166|   958k|  assert(bsize < BLOCK_SIZES_ALL);
 1167|   958k|  if (is_cur_buf_hbd(xd)) {
  ------------------
  |  Branch (1167:7): [True: 525k, False: 433k]
  ------------------
 1168|   525k|    DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
  ------------------
  |  |   19|   525k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1169|   525k|    av1_build_intra_predictors_for_interintra(
 1170|   525k|        cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
  ------------------
  |  |   76|   525k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
 1171|   525k|        MAX_SB_SIZE);
  ------------------
  |  |   32|   525k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   525k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
 1172|   525k|    av1_combine_interintra(xd, bsize, plane, pred, stride,
 1173|   525k|                           CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
  ------------------
  |  |   76|   525k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
                                         CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
  ------------------
  |  |   32|   525k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   525k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
 1174|   525k|  } else {
 1175|   433k|    DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
  ------------------
  |  |   19|   433k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1176|   433k|    av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
 1177|   433k|                                              intrapredictor, MAX_SB_SIZE);
  ------------------
  |  |   32|   433k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   433k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
 1178|   433k|    av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor,
 1179|   433k|                           MAX_SB_SIZE);
  ------------------
  |  |   32|   433k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   433k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
 1180|   433k|  }
 1181|   958k|}
reconinter.c:allow_warp:
   37|  6.24M|                      WarpedMotionParams *final_warp_params) {
   38|       |  // Note: As per the spec, we must test the fixed point scales here, which are
   39|       |  // at a higher precision (1 << 14) than the xs and ys in subpel_params (that
   40|       |  // have 1 << 10 precision).
   41|  6.24M|  if (av1_is_scaled(sf)) return 0;
  ------------------
  |  Branch (41:7): [True: 713k, False: 5.52M]
  ------------------
   42|       |
   43|  5.52M|  if (final_warp_params != NULL) *final_warp_params = default_warp_params;
  ------------------
  |  Branch (43:7): [True: 5.52M, False: 1.36k]
  ------------------
   44|       |
   45|  5.52M|  if (build_for_obmc) return 0;
  ------------------
  |  Branch (45:7): [True: 0, False: 5.52M]
  ------------------
   46|       |
   47|  5.52M|  if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
  ------------------
  |  Branch (47:7): [True: 439k, False: 5.08M]
  |  Branch (47:41): [True: 373k, False: 66.1k]
  ------------------
   48|   373k|    if (final_warp_params != NULL) *final_warp_params = mbmi->wm_params;
  ------------------
  |  Branch (48:9): [True: 373k, False: 0]
  ------------------
   49|   373k|    return 1;
   50|  5.15M|  } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
  ------------------
  |  Branch (50:14): [True: 137k, False: 5.01M]
  |  Branch (50:49): [True: 129k, False: 7.98k]
  ------------------
   51|   129k|    if (final_warp_params != NULL) *final_warp_params = *gm_params;
  ------------------
  |  Branch (51:9): [True: 129k, False: 0]
  ------------------
   52|   129k|    return 1;
   53|   129k|  }
   54|       |
   55|  5.02M|  return 0;
   56|  5.52M|}
reconinter.c:init_all_wedge_masks:
  594|      1|static void init_all_wedge_masks(void) {
  595|      1|  init_wedge_master_masks();
  596|      1|  init_wedge_masks();
  597|      1|  init_smooth_interintra_masks();
  598|      1|}
reconinter.c:init_wedge_master_masks:
  449|      1|static inline void init_wedge_master_masks(void) {
  450|      1|  int i, j;
  451|      1|  const int w = MASK_MASTER_SIZE;
  ------------------
  |  |  451|      1|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|      1|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|      1|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  452|      1|  const int h = MASK_MASTER_SIZE;
  ------------------
  |  |  451|      1|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|      1|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|      1|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  453|      1|  const int stride = MASK_MASTER_STRIDE;
  ------------------
  |  |  452|      1|#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
  |  |  ------------------
  |  |  |  |  451|      1|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   41|      1|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|      1|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  454|       |  // Note: index [0] stores the masters, and [1] its complement.
  455|       |  // Generate prototype by shifting the masters
  456|      1|  int shift = h / 4;
  457|     33|  for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (457:15): [True: 32, False: 1]
  ------------------
  458|     32|    shift_copy(wedge_master_oblique_even,
  459|     32|               &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift,
  460|     32|               MASK_MASTER_SIZE);
  ------------------
  |  |  451|     32|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|     32|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|     32|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  461|     32|    shift--;
  462|     32|    shift_copy(wedge_master_oblique_odd,
  463|     32|               &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift,
  464|     32|               MASK_MASTER_SIZE);
  ------------------
  |  |  451|     32|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|     32|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|     32|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  465|     32|    memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
  466|     32|           wedge_master_vertical,
  467|     32|           MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
  ------------------
  |  |  451|     32|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|     32|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|     32|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  468|     32|    memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
  469|     32|           wedge_master_vertical,
  470|     32|           MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
  ------------------
  |  |  451|     32|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|     32|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|     32|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  471|     32|  }
  472|       |
  473|     65|  for (i = 0; i < h; ++i) {
  ------------------
  |  Branch (473:15): [True: 64, False: 1]
  ------------------
  474|  4.16k|    for (j = 0; j < w; ++j) {
  ------------------
  |  Branch (474:17): [True: 4.09k, False: 64]
  ------------------
  475|  4.09k|      const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
  476|  4.09k|      wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk;
  477|  4.09k|      wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
  478|  4.09k|          wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
  479|  4.09k|              (1 << WEDGE_WEIGHT_BITS) - msk;
  ------------------
  |  |   44|  4.09k|#define WEDGE_WEIGHT_BITS 6
  ------------------
  480|  4.09k|      wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] =
  481|  4.09k|          wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] =
  482|  4.09k|              (1 << WEDGE_WEIGHT_BITS) - msk;
  ------------------
  |  |   44|  4.09k|#define WEDGE_WEIGHT_BITS 6
  ------------------
  483|  4.09k|      wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
  484|  4.09k|          wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk;
  485|  4.09k|      const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
  486|  4.09k|      wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
  487|  4.09k|      wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] =
  488|  4.09k|          wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
  489|  4.09k|              (1 << WEDGE_WEIGHT_BITS) - mskx;
  ------------------
  |  |   44|  4.09k|#define WEDGE_WEIGHT_BITS 6
  ------------------
  490|  4.09k|    }
  491|     64|  }
  492|      1|}
reconinter.c:shift_copy:
  148|     64|                              int width) {
  149|     64|  if (shift >= 0) {
  ------------------
  |  Branch (149:7): [True: 33, False: 31]
  ------------------
  150|     33|    memcpy(dst + shift, src, width - shift);
  151|     33|    memset(dst, src[0], shift);
  152|     33|  } else {
  153|     31|    shift = -shift;
  154|     31|    memcpy(dst, src + shift, width - shift);
  155|     31|    memset(dst + width - shift, src[width - 1], shift);
  156|     31|  }
  157|     64|}
reconinter.c:init_wedge_masks:
  494|      1|static inline void init_wedge_masks(void) {
  495|      1|  uint8_t *dst = wedge_mask_buf;
  496|      1|  BLOCK_SIZE bsize;
  497|      1|  memset(wedge_masks, 0, sizeof(wedge_masks));
  498|     23|  for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
  ------------------
  |  Branch (498:27): [True: 22, False: 1]
  ------------------
  499|     22|    const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize];
  500|     22|    const int wtypes = wedge_params->wedge_types;
  501|     22|    if (wtypes == 0) continue;
  ------------------
  |  Branch (501:9): [True: 13, False: 9]
  ------------------
  502|      9|    const uint8_t *mask;
  503|      9|    const int bw = block_size_wide[bsize];
  504|      9|    const int bh = block_size_high[bsize];
  505|      9|    int w;
  506|    153|    for (w = 0; w < wtypes; ++w) {
  ------------------
  |  Branch (506:17): [True: 144, False: 9]
  ------------------
  507|    144|      mask = get_wedge_mask_inplace(w, 0, bsize);
  508|    144|      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
  ------------------
  |  |  452|    144|#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
  |  |  ------------------
  |  |  |  |  451|    144|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   41|    144|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|    144|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  509|    144|                        bh);
  510|    144|      wedge_params->masks[0][w] = dst;
  511|    144|      dst += bw * bh;
  512|       |
  513|    144|      mask = get_wedge_mask_inplace(w, 1, bsize);
  514|    144|      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
  ------------------
  |  |  452|    144|#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
  |  |  ------------------
  |  |  |  |  451|    144|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   41|    144|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|    144|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  515|    144|                        bh);
  516|    144|      wedge_params->masks[1][w] = dst;
  517|    144|      dst += bw * bh;
  518|    144|    }
  519|      9|    assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
  520|      9|  }
  521|      1|}
reconinter.c:get_wedge_mask_inplace:
  271|    288|                                             BLOCK_SIZE sb_type) {
  272|    288|  const uint8_t *master;
  273|    288|  const int bh = block_size_high[sb_type];
  274|    288|  const int bw = block_size_wide[sb_type];
  275|    288|  const wedge_code_type *a =
  276|    288|      av1_wedge_params_lookup[sb_type].codebook + wedge_index;
  277|    288|  int woff, hoff;
  278|    288|  const uint8_t wsignflip =
  279|    288|      av1_wedge_params_lookup[sb_type].signflip[wedge_index];
  280|       |
  281|    288|  assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type));
  282|    288|  woff = (a->x_offset * bw) >> 3;
  283|    288|  hoff = (a->y_offset * bh) >> 3;
  284|    288|  master = wedge_mask_obl[neg ^ wsignflip][a->direction] +
  285|    288|           MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
  ------------------
  |  |  452|    288|#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
  |  |  ------------------
  |  |  |  |  451|    288|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   41|    288|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|    288|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                         MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
  ------------------
  |  |  451|    288|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|    288|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|    288|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  286|    288|           MASK_MASTER_SIZE / 2 - woff;
  ------------------
  |  |  451|    288|#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
  |  |  ------------------
  |  |  |  |   41|    288|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   40|    288|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  287|    288|  return master;
  288|    288|}
reconinter.c:init_smooth_interintra_masks:
  581|      1|static inline void init_smooth_interintra_masks(void) {
  582|      5|  for (int m = 0; m < INTERINTRA_MODES; ++m) {
  ------------------
  |  Branch (582:19): [True: 4, False: 1]
  ------------------
  583|     92|    for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) {
  ------------------
  |  Branch (583:22): [True: 88, False: 4]
  ------------------
  584|     88|      const int bw = block_size_wide[bs];
  585|     88|      const int bh = block_size_high[bs];
  586|     88|      if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue;
  ------------------
  |  |   41|    176|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   40|     88|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  ------------------
  ------------------
                    if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue;
  ------------------
  |  |   41|     64|#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   40|     64|#define MAX_WEDGE_SIZE_LOG2 5  // 32x32
  |  |  ------------------
  ------------------
  |  Branch (586:11): [True: 24, False: 64]
  |  Branch (586:34): [True: 8, False: 56]
  ------------------
  587|     56|      build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs,
  588|     56|                                   m);
  589|     56|    }
  590|      4|  }
  591|      1|}
reconinter.c:build_smooth_interintra_mask:
  542|   351k|                                                INTERINTRA_MODE mode) {
  543|   351k|  int i, j;
  544|   351k|  const int bw = block_size_wide[plane_bsize];
  545|   351k|  const int bh = block_size_high[plane_bsize];
  546|   351k|  const int size_scale = ii_size_scales[plane_bsize];
  547|       |
  548|   351k|  switch (mode) {
  549|  73.1k|    case II_V_PRED:
  ------------------
  |  Branch (549:5): [True: 73.1k, False: 278k]
  ------------------
  550|   798k|      for (i = 0; i < bh; ++i) {
  ------------------
  |  Branch (550:19): [True: 725k, False: 73.1k]
  ------------------
  551|   725k|        memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
  552|   725k|        mask += stride;
  553|   725k|      }
  554|  73.1k|      break;
  555|       |
  556|   165k|    case II_H_PRED:
  ------------------
  |  Branch (556:5): [True: 165k, False: 186k]
  ------------------
  557|  1.83M|      for (i = 0; i < bh; ++i) {
  ------------------
  |  Branch (557:19): [True: 1.67M, False: 165k]
  ------------------
  558|  22.8M|        for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
  ------------------
  |  Branch (558:21): [True: 21.2M, False: 1.67M]
  ------------------
  559|  1.67M|        mask += stride;
  560|  1.67M|      }
  561|   165k|      break;
  562|       |
  563|  57.7k|    case II_SMOOTH_PRED:
  ------------------
  |  Branch (563:5): [True: 57.7k, False: 293k]
  ------------------
  564|   639k|      for (i = 0; i < bh; ++i) {
  ------------------
  |  Branch (564:19): [True: 581k, False: 57.7k]
  ------------------
  565|  8.00M|        for (j = 0; j < bw; ++j)
  ------------------
  |  Branch (565:21): [True: 7.42M, False: 581k]
  ------------------
  566|  7.42M|          mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
  ------------------
  |  Branch (566:35): [True: 3.53M, False: 3.88M]
  ------------------
  567|   581k|        mask += stride;
  568|   581k|      }
  569|  57.7k|      break;
  570|       |
  571|  55.1k|    case II_DC_PRED:
  ------------------
  |  Branch (571:5): [True: 55.1k, False: 296k]
  ------------------
  572|  55.1k|    default:
  ------------------
  |  Branch (572:5): [True: 0, False: 351k]
  ------------------
  573|   616k|      for (i = 0; i < bh; ++i) {
  ------------------
  |  Branch (573:19): [True: 561k, False: 55.1k]
  ------------------
  574|   561k|        memset(mask, 32, bw * sizeof(mask[0]));
  575|   561k|        mask += stride;
  576|   561k|      }
  577|  55.1k|      break;
  578|   351k|  }
  579|   351k|}
reconinter.c:build_masked_compound_no_round:
  606|   410k|    int w, InterPredParams *inter_pred_params) {
  607|   410k|  const int ssy = inter_pred_params->subsampling_y;
  608|   410k|  const int ssx = inter_pred_params->subsampling_x;
  609|   410k|  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
  610|   410k|  const int mask_stride = block_size_wide[sb_type];
  611|   410k|#if CONFIG_AV1_HIGHBITDEPTH
  612|   410k|  if (inter_pred_params->use_hbd_buf) {
  ------------------
  |  Branch (612:7): [True: 284k, False: 125k]
  ------------------
  613|   284k|    aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
  614|   284k|                                  src1_stride, mask, mask_stride, w, h, ssx,
  615|   284k|                                  ssy, &inter_pred_params->conv_params,
  616|   284k|                                  inter_pred_params->bit_depth);
  617|   284k|  } else {
  618|   125k|    aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
  619|   125k|                                 src1_stride, mask, mask_stride, w, h, ssx, ssy,
  620|   125k|                                 &inter_pred_params->conv_params);
  621|   125k|  }
  622|       |#else
  623|       |  aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
  624|       |                               src1_stride, mask, mask_stride, w, h, ssx, ssy,
  625|       |                               &inter_pred_params->conv_params);
  626|       |#endif
  627|   410k|}
reconinter.c:increment_int_ptr:
  790|  3.38M|                                     const int num_planes) {
  791|  3.38M|  (void)xd;
  792|  3.38M|  (void)rel_mi_row;
  793|  3.38M|  (void)rel_mi_col;
  794|  3.38M|  (void)op_mi_size;
  795|  3.38M|  (void)dir;
  796|  3.38M|  (void)mi;
  797|  3.38M|  ++*(uint8_t *)fun_ctxt;
  798|  3.38M|  (void)num_planes;
  799|  3.38M|}
reconinter.c:build_obmc_inter_pred_above:
  854|   627k|    int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) {
  855|   627k|  (void)above_mi;
  856|   627k|  (void)rel_mi_row;
  857|   627k|  (void)dir;
  858|   627k|  struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
  859|   627k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  860|   627k|  const int overlap =
  861|   627k|      AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
  ------------------
  |  |   34|   627k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 618k, False: 8.99k]
  |  |  ------------------
  ------------------
  862|       |
  863|  2.50M|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (863:23): [True: 1.87M, False: 627k]
  ------------------
  864|  1.87M|    const struct macroblockd_plane *pd = &xd->plane[plane];
  865|  1.87M|    const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
  ------------------
  |  |   40|  1.87M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.87M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  866|  1.87M|    const int bh = overlap >> pd->subsampling_y;
  867|  1.87M|    const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
  ------------------
  |  |   40|  1.87M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.87M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  868|       |
  869|  1.87M|    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
  ------------------
  |  Branch (869:9): [True: 856k, False: 1.02M]
  ------------------
  870|       |
  871|  1.02M|    const int dst_stride = pd->dst.stride;
  872|  1.02M|    uint8_t *const dst = &pd->dst.buf[plane_col];
  873|  1.02M|    const int tmp_stride = ctxt->adjacent_stride[plane];
  874|  1.02M|    const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col];
  875|  1.02M|    const uint8_t *const mask = av1_get_obmc_mask(bh);
  876|  1.02M|#if CONFIG_AV1_HIGHBITDEPTH
  877|  1.02M|    const int is_hbd = is_cur_buf_hbd(xd);
  878|  1.02M|    if (is_hbd)
  ------------------
  |  Branch (878:9): [True: 467k, False: 554k]
  ------------------
  879|   467k|      aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
  880|   467k|                                 tmp_stride, mask, bw, bh, xd->bd);
  881|   554k|    else
  882|   554k|      aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
  883|   554k|                          mask, bw, bh);
  884|       |#else
  885|       |    aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask,
  886|       |                        bw, bh);
  887|       |#endif
  888|  1.02M|  }
  889|   627k|}
reconinter.c:build_obmc_inter_pred_left:
  893|   622k|    int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) {
  894|   622k|  (void)left_mi;
  895|   622k|  (void)rel_mi_col;
  896|   622k|  (void)dir;
  897|   622k|  struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
  898|   622k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  899|   622k|  const int overlap =
  900|   622k|      AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
  ------------------
  |  |   34|   622k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 612k, False: 9.86k]
  |  |  ------------------
  ------------------
  901|       |
  902|  2.48M|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (902:23): [True: 1.86M, False: 622k]
  ------------------
  903|  1.86M|    const struct macroblockd_plane *pd = &xd->plane[plane];
  904|  1.86M|    const int bw = overlap >> pd->subsampling_x;
  905|  1.86M|    const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y;
  ------------------
  |  |   40|  1.86M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.86M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  906|  1.86M|    const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
  ------------------
  |  |   40|  1.86M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.86M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  907|       |
  908|  1.86M|    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
  ------------------
  |  Branch (908:9): [True: 0, False: 1.86M]
  ------------------
  909|       |
  910|  1.86M|    const int dst_stride = pd->dst.stride;
  911|  1.86M|    uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride];
  912|  1.86M|    const int tmp_stride = ctxt->adjacent_stride[plane];
  913|  1.86M|    const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride];
  914|  1.86M|    const uint8_t *const mask = av1_get_obmc_mask(bw);
  915|       |
  916|  1.86M|#if CONFIG_AV1_HIGHBITDEPTH
  917|  1.86M|    const int is_hbd = is_cur_buf_hbd(xd);
  918|  1.86M|    if (is_hbd)
  ------------------
  |  Branch (918:9): [True: 867k, False: 995k]
  ------------------
  919|   867k|      aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
  920|   867k|                                 tmp_stride, mask, bw, bh, xd->bd);
  921|   995k|    else
  922|   995k|      aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
  923|   995k|                          mask, bw, bh);
  924|       |#else
  925|       |    aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask,
  926|       |                        bw, bh);
  927|       |#endif
  928|  1.86M|  }
  929|   622k|}
reconinter.c:modify_neighbor_predictor_for_obmc:
  841|  1.24M|static void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
  842|  1.24M|  mbmi->ref_frame[1] = NONE_FRAME;
  843|  1.24M|  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
  844|  1.24M|}
reconinter.c:combine_interintra_highbd:
 1090|   525k|    int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
 1091|   525k|  const int bw = block_size_wide[plane_bsize];
 1092|   525k|  const int bh = block_size_high[plane_bsize];
 1093|       |
 1094|   525k|  if (use_wedge_interintra) {
  ------------------
  |  Branch (1094:7): [True: 174k, False: 351k]
  ------------------
 1095|   174k|    if (av1_is_wedge_used(bsize)) {
  ------------------
  |  Branch (1095:9): [True: 174k, False: 0]
  ------------------
 1096|   174k|      const uint8_t *mask =
 1097|   174k|          av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
 1098|   174k|      const int subh = 2 * mi_size_high[bsize] == bh;
 1099|   174k|      const int subw = 2 * mi_size_wide[bsize] == bw;
 1100|   174k|      aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
 1101|   174k|                                interpred8, interstride, mask,
 1102|   174k|                                block_size_wide[bsize], bw, bh, subw, subh, bd);
 1103|   174k|    }
 1104|   174k|    return;
 1105|   174k|  }
 1106|       |
 1107|   351k|  uint8_t mask[MAX_SB_SQUARE];
 1108|   351k|  build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
 1109|   351k|  aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
 1110|   351k|                            interpred8, interstride, mask, bw, bw, bh, 0, 0,
 1111|   351k|                            bd);
 1112|   351k|}
reconinter.c:combine_interintra:
 1063|   433k|    int interstride, const uint8_t *intrapred, int intrastride) {
 1064|   433k|  const int bw = block_size_wide[plane_bsize];
 1065|   433k|  const int bh = block_size_high[plane_bsize];
 1066|       |
 1067|   433k|  if (use_wedge_interintra) {
  ------------------
  |  Branch (1067:7): [True: 103k, False: 330k]
  ------------------
 1068|   103k|    if (av1_is_wedge_used(bsize)) {
  ------------------
  |  Branch (1068:9): [True: 103k, False: 0]
  ------------------
 1069|   103k|      const uint8_t *mask =
 1070|   103k|          av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
 1071|   103k|      const int subw = 2 * mi_size_wide[bsize] == bw;
 1072|   103k|      const int subh = 2 * mi_size_high[bsize] == bh;
 1073|   103k|      aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
 1074|   103k|                         interpred, interstride, mask, block_size_wide[bsize],
 1075|   103k|                         bw, bh, subw, subh);
 1076|   103k|    }
 1077|   103k|    return;
 1078|   103k|  }
 1079|       |
 1080|   330k|  const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize];
 1081|   330k|  aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
 1082|   330k|                     interstride, mask, bw, bw, bh, 0, 0);
 1083|   330k|}

decodeframe.c:av1_init_inter_params:
  221|  17.0M|    int_interpfilters interp_filters) {
  222|  17.0M|  init_inter_block_params(inter_pred_params, block_width, block_height, pix_row,
  223|  17.0M|                          pix_col, subsampling_x, subsampling_y, bit_depth,
  224|  17.0M|                          use_hbd_buf, is_intrabc);
  225|  17.0M|  init_interp_filter_params(inter_pred_params->interp_filter_params,
  226|  17.0M|                            &interp_filters.as_filters, block_width,
  227|  17.0M|                            block_height, is_intrabc);
  228|  17.0M|  inter_pred_params->scale_factors = sf;
  229|  17.0M|  inter_pred_params->ref_frame_buf = *ref_buf;
  230|  17.0M|}
decodeframe.c:init_inter_block_params:
  199|  17.0M|                                           int is_intrabc) {
  200|  17.0M|  inter_pred_params->block_width = block_width;
  201|  17.0M|  inter_pred_params->block_height = block_height;
  202|  17.0M|  inter_pred_params->pix_row = pix_row;
  203|  17.0M|  inter_pred_params->pix_col = pix_col;
  204|  17.0M|  inter_pred_params->subsampling_x = subsampling_x;
  205|  17.0M|  inter_pred_params->subsampling_y = subsampling_y;
  206|  17.0M|  inter_pred_params->bit_depth = bit_depth;
  207|  17.0M|  inter_pred_params->use_hbd_buf = use_hbd_buf;
  208|  17.0M|  inter_pred_params->is_intrabc = is_intrabc;
  209|  17.0M|  inter_pred_params->mode = TRANSLATION_PRED;
  210|  17.0M|  inter_pred_params->comp_mode = UNIFORM_SINGLE;
  211|  17.0M|  inter_pred_params->top = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_y);
  ------------------
  |  |   32|  17.0M|  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  17.0M|  ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|  17.0M|#define AOM_BORDER_IN_PIXELS 288
  |  |  |  |  ------------------
  |  |  |  |                 ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  17.0M|#define AOM_INTERP_EXTEND 4
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  17.0M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  212|  17.0M|  inter_pred_params->left = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_x);
  ------------------
  |  |   32|  17.0M|  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  17.0M|  ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|  17.0M|#define AOM_BORDER_IN_PIXELS 288
  |  |  |  |  ------------------
  |  |  |  |                 ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  17.0M|#define AOM_INTERP_EXTEND 4
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  17.0M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  213|  17.0M|}
decodeframe.c:init_interp_filter_params:
  172|  17.0M|    int is_intrabc) {
  173|  17.0M|  if (UNLIKELY(is_intrabc)) {
  ------------------
  |  |   55|  17.0M|#define UNLIKELY(v) __builtin_expect(v, 0)
  |  |  ------------------
  |  |  |  Branch (55:21): [True: 147k, False: 16.8M]
  |  |  ------------------
  ------------------
  174|   147k|    interp_filter_params[0] = &av1_intrabc_filter_params;
  175|   147k|    interp_filter_params[1] = &av1_intrabc_filter_params;
  176|  16.8M|  } else {
  177|  16.8M|    interp_filter_params[0] = av1_get_interp_filter_params_with_block_size(
  178|  16.8M|        (InterpFilter)filter->x_filter, block_width);
  179|  16.8M|    interp_filter_params[1] = av1_get_interp_filter_params_with_block_size(
  180|  16.8M|        (InterpFilter)filter->y_filter, block_height);
  181|  16.8M|  }
  182|  17.0M|}
decodeframe.c:clamp_mv_to_umv_border_sb:
  345|  17.0M|                                           int ss_x, int ss_y) {
  346|       |  // If the MV points so far into the UMV border that no visible pixels
  347|       |  // are used for reconstruction, the subpel part of the MV can be
  348|       |  // discarded and the MV limited to 16 pixels with equivalent results.
  349|  17.0M|  const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS;
  ------------------
  |  |   31|  17.0M|#define AOM_INTERP_EXTEND 4
  ------------------
                const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS;
  ------------------
  |  |   23|  17.0M|#define SUBPEL_BITS 4
  ------------------
  350|  17.0M|  const int spel_right = spel_left - SUBPEL_SHIFTS;
  ------------------
  |  |   25|  17.0M|#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  17.0M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  351|  17.0M|  const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
  ------------------
  |  |   31|  17.0M|#define AOM_INTERP_EXTEND 4
  ------------------
                const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
  ------------------
  |  |   23|  17.0M|#define SUBPEL_BITS 4
  ------------------
  352|  17.0M|  const int spel_bottom = spel_top - SUBPEL_SHIFTS;
  ------------------
  |  |   25|  17.0M|#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  17.0M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  353|  17.0M|  MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
  354|  17.0M|                    (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
  355|  17.0M|  assert(ss_x <= 1);
  356|  17.0M|  assert(ss_y <= 1);
  357|  17.0M|  const SubpelMvLimits mv_limits = {
  358|  17.0M|    xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
  359|  17.0M|    xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
  360|  17.0M|    xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
  361|  17.0M|    xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom
  362|  17.0M|  };
  363|       |
  364|  17.0M|  clamp_mv(&clamped_mv, &mv_limits);
  365|       |
  366|  17.0M|  return clamped_mv;
  367|  17.0M|}
decodeframe.c:av1_init_comp_mode:
  232|  3.36M|static inline void av1_init_comp_mode(InterPredParams *inter_pred_params) {
  233|  3.36M|  inter_pred_params->comp_mode = UNIFORM_COMP;
  234|  3.36M|}
decodemv.c:av1_is_wedge_used:
  329|   339k|static inline int av1_is_wedge_used(BLOCK_SIZE sb_type) {
  330|   339k|  return av1_wedge_params_lookup[sb_type].wedge_types > 0;
  331|   339k|}
decodemv.c:is_any_masked_compound_used:
  312|   524k|static inline int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
  313|   524k|  COMPOUND_TYPE comp_type;
  314|   524k|  int i;
  315|   524k|  if (!is_comp_ref_allowed(sb_type)) return 0;
  ------------------
  |  Branch (315:7): [True: 0, False: 524k]
  ------------------
  316|  1.61M|  for (i = 0; i < COMPOUND_TYPES; i++) {
  ------------------
  |  Branch (316:15): [True: 1.61M, False: 1.34k]
  ------------------
  317|  1.61M|    comp_type = (COMPOUND_TYPE)i;
  318|  1.61M|    if (is_masked_compound_type(comp_type) &&
  ------------------
  |  Branch (318:9): [True: 574k, False: 1.04M]
  ------------------
  319|  1.61M|        is_interinter_compound_used(comp_type, sb_type))
  ------------------
  |  Branch (319:9): [True: 523k, False: 51.5k]
  ------------------
  320|   523k|      return 1;
  321|  1.61M|  }
  322|  1.34k|  return 0;
  323|   524k|}
decodemv.c:is_interinter_compound_used:
  300|   782k|                                              BLOCK_SIZE sb_type) {
  301|   782k|  const int comp_allowed = is_comp_ref_allowed(sb_type);
  302|   782k|  switch (type) {
  303|      0|    case COMPOUND_AVERAGE:
  ------------------
  |  Branch (303:5): [True: 0, False: 782k]
  ------------------
  304|      0|    case COMPOUND_DISTWTD:
  ------------------
  |  Branch (304:5): [True: 0, False: 782k]
  ------------------
  305|  50.9k|    case COMPOUND_DIFFWTD: return comp_allowed;
  ------------------
  |  Branch (305:5): [True: 50.9k, False: 731k]
  ------------------
  306|   731k|    case COMPOUND_WEDGE:
  ------------------
  |  Branch (306:5): [True: 731k, False: 51.0k]
  ------------------
  307|   731k|      return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0;
  ------------------
  |  Branch (307:14): [True: 731k, False: 18.4E]
  |  Branch (307:30): [True: 661k, False: 70.2k]
  ------------------
  308|      0|    default: assert(0); return 0;
  ------------------
  |  Branch (308:5): [True: 0, False: 782k]
  ------------------
  309|   782k|  }
  310|   782k|}
decodemv.c:av1_is_interp_needed:
  420|  4.55M|static inline int av1_is_interp_needed(const MACROBLOCKD *const xd) {
  421|  4.55M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
  422|  4.55M|  if (mbmi->skip_mode) return 0;
  ------------------
  |  Branch (422:7): [True: 108k, False: 4.44M]
  ------------------
  423|  4.44M|  if (mbmi->motion_mode == WARPED_CAUSAL) return 0;
  ------------------
  |  Branch (423:7): [True: 305k, False: 4.13M]
  ------------------
  424|  4.13M|  if (is_nontrans_global_motion(xd, xd->mi[0])) return 0;
  ------------------
  |  Branch (424:7): [True: 482k, False: 3.65M]
  ------------------
  425|  3.65M|  return 1;
  426|  4.13M|}
decodemv.c:set_default_interp_filters:
  415|   897k|    MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
  416|   897k|  mbmi->interp_filters =
  417|   897k|      av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter));
  418|   897k|}
reconinter.c:highbd_inter_predictor:
  279|  8.38M|    int bd) {
  280|  8.38M|  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
  281|  8.38M|  const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
  282|  8.38M|  if (is_scaled) {
  ------------------
  |  Branch (282:7): [True: 724k, False: 7.65M]
  ------------------
  283|   724k|    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
  284|   724k|                                  interp_filters, subpel_params->subpel_x,
  285|   724k|                                  subpel_params->xs, subpel_params->subpel_y,
  286|   724k|                                  subpel_params->ys, 1, conv_params, bd);
  287|  7.65M|  } else {
  288|  7.65M|    SubpelParams sp = *subpel_params;
  289|  7.65M|    revert_scale_extra_bits(&sp);
  290|  7.65M|    av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
  291|  7.65M|                                  interp_filters, sp.subpel_x, sp.xs,
  292|  7.65M|                                  sp.subpel_y, sp.ys, 0, conv_params, bd);
  293|  7.65M|  }
  294|  8.38M|}
reconinter.c:has_scale:
  240|  16.5M|static inline int has_scale(int xs, int ys) {
  241|  16.5M|  return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
  ------------------
  |  |   29|  33.0M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  16.5M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
                return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
  ------------------
  |  |   29|  14.2M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  14.2M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  |  Branch (241:10): [True: 2.31M, False: 14.2M]
  |  Branch (241:39): [True: 4.54k, False: 14.2M]
  ------------------
  242|  16.5M|}
reconinter.c:revert_scale_extra_bits:
  244|  14.2M|static inline void revert_scale_extra_bits(SubpelParams *sp) {
  245|  14.2M|  sp->subpel_x >>= SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  14.2M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  14.2M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  14.2M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  246|  14.2M|  sp->subpel_y >>= SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  14.2M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  14.2M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  14.2M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  247|  14.2M|  sp->xs >>= SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  14.2M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  14.2M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  14.2M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  248|  14.2M|  sp->ys >>= SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  14.2M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  14.2M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  14.2M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  249|  14.2M|  assert(sp->subpel_x < SUBPEL_SHIFTS);
  250|  14.2M|  assert(sp->subpel_y < SUBPEL_SHIFTS);
  251|  14.2M|  assert(sp->xs <= SUBPEL_SHIFTS);
  252|  14.2M|  assert(sp->ys <= SUBPEL_SHIFTS);
  253|  14.2M|}
reconinter.c:inter_predictor:
  258|  8.14M|    ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) {
  259|  8.14M|  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
  260|  8.14M|  const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
  261|  8.14M|  if (is_scaled) {
  ------------------
  |  Branch (261:7): [True: 1.59M, False: 6.55M]
  ------------------
  262|  1.59M|    av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
  263|  1.59M|                           interp_filters, subpel_params->subpel_x,
  264|  1.59M|                           subpel_params->xs, subpel_params->subpel_y,
  265|  1.59M|                           subpel_params->ys, 1, conv_params);
  266|  6.55M|  } else {
  267|  6.55M|    SubpelParams sp = *subpel_params;
  268|  6.55M|    revert_scale_extra_bits(&sp);
  269|  6.55M|    av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
  270|  6.55M|                           interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
  271|  6.55M|                           sp.ys, 0, conv_params);
  272|  6.55M|  }
  273|  8.14M|}
reconinter.c:av1_get_contiguous_soft_mask:
  458|   442k|                                                          BLOCK_SIZE sb_type) {
  459|   442k|  return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
  460|   442k|}
reconinter.c:get_wedge_types_lookup:
  325|    288|static inline int get_wedge_types_lookup(BLOCK_SIZE sb_type) {
  326|    288|  return av1_wedge_params_lookup[sb_type].wedge_types;
  327|    288|}
reconinter.c:setup_pred_plane:
  390|  91.6M|                                    int subsampling_x, int subsampling_y) {
  391|       |  // Offset the buffer pointer
  392|  91.6M|  if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
  ------------------
  |  Branch (392:7): [True: 49.1M, False: 42.5M]
  |  Branch (392:24): [True: 3.48M, False: 45.6M]
  |  Branch (392:43): [True: 3.48M, False: 18.4E]
  ------------------
  393|  3.48M|    mi_row -= 1;
  394|  91.6M|  if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
  ------------------
  |  Branch (394:7): [True: 49.1M, False: 42.5M]
  |  Branch (394:24): [True: 2.95M, False: 46.2M]
  |  Branch (394:43): [True: 2.95M, False: 18.4E]
  ------------------
  395|  2.95M|    mi_col -= 1;
  396|       |
  397|  91.6M|  const int x = (MI_SIZE * mi_col) >> subsampling_x;
  ------------------
  |  |   40|  91.6M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  91.6M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  398|  91.6M|  const int y = (MI_SIZE * mi_row) >> subsampling_y;
  ------------------
  |  |   40|  91.6M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  91.6M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  399|  91.6M|  dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
  400|  91.6M|  dst->buf0 = src;
  401|  91.6M|  dst->width = width;
  402|  91.6M|  dst->height = height;
  403|  91.6M|  dst->stride = stride;
  404|  91.6M|}
reconinter.c:scaled_buffer_offset:
  371|  91.6M|                                           const struct scale_factors *sf) {
  372|  91.6M|  int x, y;
  373|  91.6M|  if (!sf) {
  ------------------
  |  Branch (373:7): [True: 73.8M, False: 17.7M]
  ------------------
  374|  73.8M|    x = x_offset;
  375|  73.8M|    y = y_offset;
  376|  73.8M|  } else if (av1_is_scaled(sf)) {
  ------------------
  |  Branch (376:14): [True: 2.47M, False: 15.3M]
  ------------------
  377|  2.47M|    x = av1_scaled_x(x_offset, sf) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  2.47M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  2.47M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  2.47M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  378|  2.47M|    y = av1_scaled_y(y_offset, sf) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  2.47M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  2.47M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  2.47M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  379|  15.3M|  } else {
  380|  15.3M|    x = av1_unscaled_value(x_offset, sf) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  15.3M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  15.3M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  15.3M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  381|  15.3M|    y = av1_unscaled_value(y_offset, sf) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  15.3M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  15.3M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  15.3M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  382|  15.3M|  }
  383|  91.6M|  return (int64_t)y * stride + x;
  384|  91.6M|}
reconinter.c:av1_is_wedge_used:
  329|   277k|static inline int av1_is_wedge_used(BLOCK_SIZE sb_type) {
  330|   277k|  return av1_wedge_params_lookup[sb_type].wedge_types > 0;
  331|   277k|}

decodeframe.c:build_inter_predictors:
  246|  14.0M|                                          uint8_t **mc_buf) {
  247|  14.0M|  if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
  ------------------
  |  Branch (247:7): [True: 1.09M, False: 12.9M]
  ------------------
  248|  14.0M|                      build_for_obmc)) {
  249|  1.09M|    assert(bw < 8 || bh < 8);
  250|  1.09M|    build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf);
  251|  12.9M|  } else {
  252|  12.9M|    build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
  253|  12.9M|                                          bh, mi_x, mi_y, mc_buf);
  254|  12.9M|  }
  255|  14.0M|}
decodeframe.c:is_sub8x8_inter:
   55|  14.0M|                            int is_intrabc, int build_for_obmc) {
   56|  14.0M|  if (is_intrabc || build_for_obmc) {
  ------------------
  |  Branch (56:7): [True: 155k, False: 13.9M]
  |  Branch (56:21): [True: 2.87M, False: 11.0M]
  ------------------
   57|  3.03M|    return false;
   58|  3.03M|  }
   59|       |
   60|  11.0M|  const struct macroblockd_plane *const pd = &xd->plane[plane];
   61|  11.0M|  const int ss_x = pd->subsampling_x;
   62|  11.0M|  const int ss_y = pd->subsampling_y;
   63|  11.0M|  const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x;
  ------------------
  |  Branch (63:25): [True: 1.39M, False: 9.64M]
  |  Branch (63:58): [True: 619k, False: 773k]
  ------------------
   64|  11.0M|  const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y;
  ------------------
  |  Branch (64:25): [True: 1.65M, False: 9.38M]
  |  Branch (64:58): [True: 751k, False: 906k]
  ------------------
   65|  11.0M|  if (!is_sub4_x && !is_sub4_y) {
  ------------------
  |  Branch (65:7): [True: 10.4M, False: 620k]
  |  Branch (65:21): [True: 9.78M, False: 632k]
  ------------------
   66|  9.78M|    return false;
   67|  9.78M|  }
   68|       |
   69|       |  // For sub8x8 chroma blocks, we may be covering more than one luma block's
   70|       |  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
   71|       |  // the top-left corner of the prediction source - the correct top-left corner
   72|       |  // is at (pre_x, pre_y).
   73|  1.25M|  const int row_start = is_sub4_y ? -1 : 0;
  ------------------
  |  Branch (73:25): [True: 751k, False: 502k]
  ------------------
   74|  1.25M|  const int col_start = is_sub4_x ? -1 : 0;
  ------------------
  |  Branch (74:25): [True: 619k, False: 634k]
  ------------------
   75|       |
   76|  3.01M|  for (int row = row_start; row <= 0; ++row) {
  ------------------
  |  Branch (76:29): [True: 1.91M, False: 1.10M]
  ------------------
   77|  4.32M|    for (int col = col_start; col <= 0; ++col) {
  ------------------
  |  Branch (77:31): [True: 2.55M, False: 1.76M]
  ------------------
   78|  2.55M|      const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
   79|  2.55M|      if (!is_inter_block(this_mbmi)) return false;
  ------------------
  |  Branch (79:11): [True: 152k, False: 2.40M]
  ------------------
   80|  2.40M|      if (is_intrabc_block(this_mbmi)) return false;
  ------------------
  |  Branch (80:11): [True: 0, False: 2.40M]
  ------------------
   81|  2.40M|    }
   82|  1.91M|  }
   83|  1.10M|  return true;
   84|  1.25M|}
decodeframe.c:build_inter_predictors_sub8x8:
   91|  1.09M|                                                 uint8_t **mc_buf) {
   92|       |#else
   93|       |static inline void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
   94|       |                                                 MACROBLOCKD *xd, int plane,
   95|       |                                                 const MB_MODE_INFO *mi,
   96|       |                                                 int mi_x, int mi_y) {
   97|       |#endif  // IS_DEC
   98|  1.09M|  const BLOCK_SIZE bsize = mi->bsize;
   99|  1.09M|  struct macroblockd_plane *const pd = &xd->plane[plane];
  100|  1.09M|  const bool ss_x = pd->subsampling_x;
  101|  1.09M|  const bool ss_y = pd->subsampling_y;
  102|  1.09M|  const int b4_w = block_size_wide[bsize] >> ss_x;
  103|  1.09M|  const int b4_h = block_size_high[bsize] >> ss_y;
  104|  1.09M|  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
  105|  1.09M|  const int b8_w = block_size_wide[plane_bsize];
  106|  1.09M|  const int b8_h = block_size_high[plane_bsize];
  107|  1.09M|  const int is_compound = has_second_ref(mi);
  108|  1.09M|  assert(!is_compound);
  109|  1.09M|  assert(!is_intrabc_block(mi));
  110|       |
  111|       |  // For sub8x8 chroma blocks, we may be covering more than one luma block's
  112|       |  // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
  113|       |  // the top-left corner of the prediction source - the correct top-left corner
  114|       |  // is at (pre_x, pre_y).
  115|  1.09M|  const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
  ------------------
  |  Branch (115:25): [True: 659k, False: 440k]
  |  Branch (115:58): [True: 659k, False: 95]
  ------------------
  116|  1.09M|  const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
  ------------------
  |  Branch (116:25): [True: 535k, False: 564k]
  |  Branch (116:58): [True: 535k, False: 0]
  ------------------
  117|  1.09M|  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
  ------------------
  |  |   40|  1.09M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.09M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  118|  1.09M|  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
  ------------------
  |  |   40|  1.09M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.09M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  119|       |
  120|  1.09M|  int row = row_start;
  121|  2.85M|  for (int y = 0; y < b8_h; y += b4_h) {
  ------------------
  |  Branch (121:19): [True: 1.75M, False: 1.09M]
  ------------------
  122|  1.75M|    int col = col_start;
  123|  4.14M|    for (int x = 0; x < b8_w; x += b4_w) {
  ------------------
  |  Branch (123:21): [True: 2.38M, False: 1.75M]
  ------------------
  124|  2.38M|      MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
  125|  2.38M|      struct buf_2d *const dst_buf = &pd->dst;
  126|  2.38M|      uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
  127|  2.38M|      int ref = 0;
  128|  2.38M|      const RefCntBuffer *ref_buf =
  129|  2.38M|          get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
  130|  2.38M|      const struct scale_factors *ref_scale_factors =
  131|  2.38M|          get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
  132|  2.38M|      const struct scale_factors *const sf = ref_scale_factors;
  133|  2.38M|      const struct buf_2d pre_buf = {
  134|  2.38M|        NULL,
  135|  2.38M|        (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer,
  ------------------
  |  Branch (135:9): [True: 1.19M, False: 1.19M]
  ------------------
  136|  2.38M|        ref_buf->buf.uv_crop_width,
  137|  2.38M|        ref_buf->buf.uv_crop_height,
  138|  2.38M|        ref_buf->buf.uv_stride,
  139|  2.38M|      };
  140|       |
  141|  2.38M|      const MV mv = this_mbmi->mv[ref].as_mv;
  142|       |
  143|  2.38M|      InterPredParams inter_pred_params;
  144|  2.38M|      av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
  145|  2.38M|                            pre_x + x, pd->subsampling_x, pd->subsampling_y,
  146|  2.38M|                            xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
  147|  2.38M|                            &pre_buf, this_mbmi->interp_filters);
  148|  2.38M|      inter_pred_params.conv_params =
  149|  2.38M|          get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd);
  150|       |
  151|  2.38M|#if IS_DEC
  152|  2.38M|      build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
  153|  2.38M|                                xd, mi_x + x, mi_y + y, ref, mc_buf);
  154|       |#else
  155|       |      build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params);
  156|       |#endif  // IS_DEC
  157|       |
  158|  2.38M|      ++col;
  159|  2.38M|    }
  160|  1.75M|    ++row;
  161|  1.75M|  }
  162|  1.09M|}
decodeframe.c:build_one_inter_predictor:
   22|  17.0M|                                             uint8_t **mc_buf) {
   23|       |#else
   24|       |static inline void build_one_inter_predictor(
   25|       |    uint8_t *dst, int dst_stride, const MV *src_mv,
   26|       |    InterPredParams *inter_pred_params) {
   27|       |#endif  // IS_DEC
   28|  17.0M|  SubpelParams subpel_params;
   29|  17.0M|  uint8_t *src;
   30|  17.0M|  int src_stride;
   31|  17.0M|#if IS_DEC
   32|  17.0M|  dec_calc_subpel_params_and_extend(src_mv, inter_pred_params, xd, mi_x, mi_y,
   33|  17.0M|                                    ref, mc_buf, &src, &subpel_params,
   34|  17.0M|                                    &src_stride);
   35|       |#else
   36|       |  enc_calc_subpel_params(src_mv, inter_pred_params, &src, &subpel_params,
   37|       |                         &src_stride);
   38|       |#endif  // IS_DEC
   39|  17.0M|  if (inter_pred_params->comp_mode == UNIFORM_SINGLE ||
  ------------------
  |  Branch (39:7): [True: 13.6M, False: 3.36M]
  ------------------
   40|  17.0M|      inter_pred_params->comp_mode == UNIFORM_COMP) {
  ------------------
  |  Branch (40:7): [True: 2.95M, False: 410k]
  ------------------
   41|  16.6M|    av1_make_inter_predictor(src, src_stride, dst, dst_stride,
   42|  16.6M|                             inter_pred_params, &subpel_params);
   43|  16.6M|  } else {
   44|   413k|    av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride,
   45|   413k|                                    inter_pred_params, &subpel_params);
   46|   413k|  }
   47|  17.0M|}
decodeframe.c:build_inter_predictors_8x8_and_bigger:
  167|  12.9M|    int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) {
  168|       |#else
  169|       |static inline void build_inter_predictors_8x8_and_bigger(
  170|       |    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
  171|       |    int build_for_obmc, int bw, int bh, int mi_x, int mi_y) {
  172|       |#endif  // IS_DEC
  173|  12.9M|  const int is_compound = has_second_ref(mi);
  174|  12.9M|  const int is_intrabc = is_intrabc_block(mi);
  175|  12.9M|  assert(IMPLIES(is_intrabc, !is_compound));
  176|  12.9M|  struct macroblockd_plane *const pd = &xd->plane[plane];
  177|  12.9M|  struct buf_2d *const dst_buf = &pd->dst;
  178|  12.9M|  uint8_t *const dst = dst_buf->buf;
  179|       |
  180|  12.9M|  int is_global[2] = { 0, 0 };
  181|  27.6M|  for (int ref = 0; ref < 1 + is_compound; ++ref) {
  ------------------
  |  Branch (181:21): [True: 14.6M, False: 12.9M]
  ------------------
  182|  14.6M|    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
  183|  14.6M|    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
  184|  14.6M|  }
  185|       |
  186|  12.9M|  const BLOCK_SIZE bsize = mi->bsize;
  187|  12.9M|  const int ss_x = pd->subsampling_x;
  188|  12.9M|  const int ss_y = pd->subsampling_y;
  189|  12.9M|  const int row_start =
  190|  12.9M|      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
  ------------------
  |  Branch (190:7): [True: 1.26M, False: 11.6M]
  |  Branch (190:40): [True: 218k, False: 1.04M]
  |  Branch (190:48): [True: 94.8k, False: 124k]
  ------------------
  191|  12.9M|  const int col_start =
  192|  12.9M|      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
  ------------------
  |  Branch (192:7): [True: 1.08M, False: 11.8M]
  |  Branch (192:40): [True: 187k, False: 893k]
  |  Branch (192:48): [True: 84.7k, False: 103k]
  ------------------
  193|  12.9M|  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
  ------------------
  |  |   40|  12.9M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  12.9M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  194|  12.9M|  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
  ------------------
  |  |   40|  12.9M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  12.9M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  195|       |
  196|  27.6M|  for (int ref = 0; ref < 1 + is_compound; ++ref) {
  ------------------
  |  Branch (196:21): [True: 14.6M, False: 12.9M]
  ------------------
  197|  14.6M|    const struct scale_factors *const sf =
  198|  14.6M|        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
  ------------------
  |  Branch (198:9): [True: 147k, False: 14.4M]
  ------------------
  199|  14.6M|    struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
  ------------------
  |  Branch (199:36): [True: 147k, False: 14.4M]
  ------------------
  200|  14.6M|    const MV mv = mi->mv[ref].as_mv;
  201|  14.6M|    const WarpTypesAllowed warp_types = { is_global[ref],
  202|  14.6M|                                          mi->motion_mode == WARPED_CAUSAL };
  203|       |
  204|  14.6M|    InterPredParams inter_pred_params;
  205|  14.6M|    av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
  206|  14.6M|                          pd->subsampling_x, pd->subsampling_y, xd->bd,
  207|  14.6M|                          is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
  208|  14.6M|                          mi->interp_filters);
  209|  14.6M|    if (is_compound) av1_init_comp_mode(&inter_pred_params);
  ------------------
  |  Branch (209:9): [True: 3.36M, False: 11.2M]
  ------------------
  210|  14.6M|    inter_pred_params.conv_params = get_conv_params_no_round(
  211|  14.6M|        ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
  ------------------
  |  |   32|  14.6M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  14.6M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  212|       |
  213|  14.6M|    av1_dist_wtd_comp_weight_assign(
  214|  14.6M|        cm, mi, &inter_pred_params.conv_params.fwd_offset,
  215|  14.6M|        &inter_pred_params.conv_params.bck_offset,
  216|  14.6M|        &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
  217|       |
  218|  14.6M|    if (!build_for_obmc)
  ------------------
  |  Branch (218:9): [True: 11.7M, False: 2.87M]
  ------------------
  219|  11.7M|      av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
  220|       |
  221|  14.6M|    if (is_masked_compound_type(mi->interinter_comp.type)) {
  ------------------
  |  Branch (221:9): [True: 821k, False: 13.8M]
  ------------------
  222|   821k|      inter_pred_params.sb_type = mi->bsize;
  223|   821k|      inter_pred_params.mask_comp = mi->interinter_comp;
  224|   821k|      if (ref == 1) {
  ------------------
  |  Branch (224:11): [True: 410k, False: 410k]
  ------------------
  225|   410k|        inter_pred_params.conv_params.do_average = 0;
  226|   410k|        inter_pred_params.comp_mode = MASK_COMP;
  227|   410k|      }
  228|       |      // Assign physical buffer.
  229|   821k|      inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
  230|   821k|    }
  231|       |
  232|  14.6M|#if IS_DEC
  233|  14.6M|    build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, xd,
  234|  14.6M|                              mi_x, mi_y, ref, mc_buf);
  235|       |#else
  236|       |    build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params);
  237|       |#endif  // IS_DEC
  238|  14.6M|  }
  239|  12.9M|}

av1_predict_intra_block:
 1695|   113M|                             int plane) {
 1696|   113M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
 1697|   113M|  const int txwpx = tx_size_wide[tx_size];
 1698|   113M|  const int txhpx = tx_size_high[tx_size];
 1699|   113M|  const int x = col_off << MI_SIZE_LOG2;
  ------------------
  |  |   39|   113M|#define MI_SIZE_LOG2 2
  ------------------
 1700|   113M|  const int y = row_off << MI_SIZE_LOG2;
  ------------------
  |  |   39|   113M|#define MI_SIZE_LOG2 2
  ------------------
 1701|   113M|  const int is_hbd = is_cur_buf_hbd(xd);
 1702|       |
 1703|   113M|  assert(mode < INTRA_MODES);
 1704|       |
 1705|   113M|  if (use_palette) {
  ------------------
  |  Branch (1705:7): [True: 252k, False: 113M]
  ------------------
 1706|   252k|    int r, c;
 1707|   252k|    const uint8_t *const map = xd->plane[plane != 0].color_index_map +
 1708|   252k|                               xd->color_index_map_offset[plane != 0];
 1709|   252k|    const uint16_t *const palette =
 1710|   252k|        mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
  ------------------
  |  |   63|   252k|#define PALETTE_MAX_SIZE 8
  ------------------
 1711|   252k|    if (is_hbd) {
  ------------------
  |  Branch (1711:9): [True: 148k, False: 103k]
  ------------------
 1712|   148k|      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
  ------------------
  |  |   75|   148k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1713|  1.51M|      for (r = 0; r < txhpx; ++r) {
  ------------------
  |  Branch (1713:19): [True: 1.36M, False: 148k]
  ------------------
 1714|  17.9M|        for (c = 0; c < txwpx; ++c) {
  ------------------
  |  Branch (1714:21): [True: 16.5M, False: 1.36M]
  ------------------
 1715|  16.5M|          dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]];
 1716|  16.5M|        }
 1717|  1.36M|      }
 1718|   148k|    } else {
 1719|   991k|      for (r = 0; r < txhpx; ++r) {
  ------------------
  |  Branch (1719:19): [True: 887k, False: 103k]
  ------------------
 1720|  11.2M|        for (c = 0; c < txwpx; ++c) {
  ------------------
  |  Branch (1720:21): [True: 10.3M, False: 887k]
  ------------------
 1721|  10.3M|          dst[r * dst_stride + c] =
 1722|  10.3M|              (uint8_t)palette[map[(r + y) * wpx + c + x]];
 1723|  10.3M|        }
 1724|   887k|      }
 1725|   103k|    }
 1726|   252k|    return;
 1727|   252k|  }
 1728|       |
 1729|   113M|  const struct macroblockd_plane *const pd = &xd->plane[plane];
 1730|   113M|  const int ss_x = pd->subsampling_x;
 1731|   113M|  const int ss_y = pd->subsampling_y;
 1732|   113M|  const int have_top =
 1733|   113M|      row_off || (ss_y ? xd->chroma_up_available : xd->up_available);
  ------------------
  |  Branch (1733:7): [True: 78.8M, False: 34.3M]
  |  Branch (1733:18): [True: 32.8M, False: 1.55M]
  |  Branch (1733:19): [True: 9.70M, False: 24.6M]
  ------------------
 1734|   113M|  const int have_left =
 1735|   113M|      col_off || (ss_x ? xd->chroma_left_available : xd->left_available);
  ------------------
  |  Branch (1735:7): [True: 77.3M, False: 35.9M]
  |  Branch (1735:18): [True: 33.2M, False: 2.64M]
  |  Branch (1735:19): [True: 9.73M, False: 26.1M]
  ------------------
 1736|       |
 1737|       |  // Distance between the right edge of this prediction block to
 1738|       |  // the frame right edge
 1739|   113M|  const int xr = (xd->mb_to_right_edge >> (3 + ss_x)) + wpx - x - txwpx;
 1740|       |  // Distance between the bottom edge of this prediction block to
 1741|       |  // the frame bottom edge
 1742|   113M|  const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx;
 1743|   113M|  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
 1744|   113M|  const int is_dr_mode = av1_is_directional_mode(mode);
 1745|       |
 1746|       |  // The computations in this function, as well as in build_intra_predictors(),
 1747|       |  // are generalized for all intra modes. Some of these operations are not
 1748|       |  // required since non-directional intra modes (i.e., DC, SMOOTH, SMOOTH_H,
 1749|       |  // SMOOTH_V, and PAETH) specifically require left and top neighbors. Hence, a
 1750|       |  // separate function build_non_directional_intra_predictors() is introduced
 1751|       |  // for these modes to avoid redundant computations while generating pred data.
 1752|       |
 1753|   113M|  const int n_top_px = have_top ? AOMMIN(txwpx, xr + txwpx) : 0;
  ------------------
  |  |   34|   111M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 109M, False: 2.26M]
  |  |  ------------------
  ------------------
  |  Branch (1753:24): [True: 111M, False: 1.54M]
  ------------------
 1754|   113M|  const int n_left_px = have_left ? AOMMIN(txhpx, yd + txhpx) : 0;
  ------------------
  |  |   34|   110M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 110M, False: 465k]
  |  |  ------------------
  ------------------
  |  Branch (1754:25): [True: 110M, False: 2.64M]
  ------------------
 1755|   113M|  if (!use_filter_intra && !is_dr_mode) {
  ------------------
  |  Branch (1755:7): [True: 111M, False: 1.40M]
  |  Branch (1755:28): [True: 100M, False: 10.9M]
  ------------------
 1756|   100M|#if CONFIG_AV1_HIGHBITDEPTH
 1757|   100M|    if (is_hbd) {
  ------------------
  |  Branch (1757:9): [True: 88.3M, False: 12.5M]
  ------------------
 1758|  88.3M|      highbd_build_non_directional_intra_predictors(
 1759|  88.3M|          ref, ref_stride, dst, dst_stride, mode, tx_size, n_top_px, n_left_px,
 1760|  88.3M|          xd->bd);
 1761|  88.3M|      return;
 1762|  88.3M|    }
 1763|  12.5M|#endif  // CONFIG_AV1_HIGHBITDEPTH
 1764|  12.5M|    build_non_directional_intra_predictors(ref, ref_stride, dst, dst_stride,
 1765|  12.5M|                                           mode, tx_size, n_top_px, n_left_px);
 1766|  12.5M|    return;
 1767|   100M|  }
 1768|       |
 1769|  12.3M|  const int txw = tx_size_wide_unit[tx_size];
 1770|  12.3M|  const int txh = tx_size_high_unit[tx_size];
 1771|  12.3M|  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
  ------------------
  |  |   39|  12.3M|#define MI_SIZE_LOG2 2
  ------------------
 1772|  12.3M|  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
  ------------------
  |  |   39|  12.3M|#define MI_SIZE_LOG2 2
  ------------------
 1773|  12.3M|  const int right_available =
 1774|  12.3M|      mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end;
 1775|  12.3M|  const int bottom_available =
 1776|  12.3M|      (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end);
  ------------------
  |  Branch (1776:7): [True: 12.2M, False: 136k]
  |  Branch (1776:19): [True: 12.1M, False: 29.2k]
  ------------------
 1777|       |
 1778|  12.3M|  const PARTITION_TYPE partition = mbmi->partition;
 1779|       |
 1780|  12.3M|  BLOCK_SIZE bsize = mbmi->bsize;
 1781|       |  // force 4x4 chroma component block size.
 1782|  12.3M|  if (ss_x || ss_y) {
  ------------------
  |  Branch (1782:7): [True: 3.33M, False: 9.02M]
  |  Branch (1782:15): [True: 141, False: 9.02M]
  ------------------
 1783|  3.35M|    bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
 1784|  3.35M|  }
 1785|       |
 1786|  12.3M|  int p_angle = 0;
 1787|  12.3M|  int need_top_right = extend_modes[mode] & NEED_ABOVERIGHT;
 1788|  12.3M|  int need_bottom_left = extend_modes[mode] & NEED_BOTTOMLEFT;
 1789|       |
 1790|  12.3M|  if (use_filter_intra) {
  ------------------
  |  Branch (1790:7): [True: 1.41M, False: 10.9M]
  ------------------
 1791|  1.41M|    need_top_right = 0;
 1792|  1.41M|    need_bottom_left = 0;
 1793|  1.41M|  }
 1794|  12.3M|  if (is_dr_mode) {
  ------------------
  |  Branch (1794:7): [True: 10.9M, False: 1.39M]
  ------------------
 1795|  10.9M|    p_angle = mode_to_angle_map[mode] + angle_delta;
 1796|  10.9M|    need_top_right = p_angle < 90;
 1797|  10.9M|    need_bottom_left = p_angle > 180;
 1798|  10.9M|  }
 1799|       |
 1800|       |  // Possible states for have_top_right(TR) and have_bottom_left(BL)
 1801|       |  // -1 : TR and BL are not needed
 1802|       |  //  0 : TR and BL are needed but not available
 1803|       |  // > 0 : TR and BL are needed and pixels are available
 1804|  12.3M|  const int have_top_right =
 1805|  12.3M|      need_top_right ? has_top_right(sb_size, bsize, mi_row, mi_col, have_top,
  ------------------
  |  Branch (1805:7): [True: 1.29M, False: 11.0M]
  ------------------
 1806|  1.29M|                                     right_available, partition, tx_size,
 1807|  1.29M|                                     row_off, col_off, ss_x, ss_y)
 1808|  12.3M|                     : -1;
 1809|  12.3M|  const int have_bottom_left =
 1810|  12.3M|      need_bottom_left ? has_bottom_left(sb_size, bsize, mi_row, mi_col,
  ------------------
  |  Branch (1810:7): [True: 2.08M, False: 10.2M]
  ------------------
 1811|  2.08M|                                         bottom_available, have_left, partition,
 1812|  2.08M|                                         tx_size, row_off, col_off, ss_x, ss_y)
 1813|  12.3M|                       : -1;
 1814|       |
 1815|  12.3M|  const int disable_edge_filter = !enable_intra_edge_filter;
 1816|  12.3M|  const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane);
 1817|  12.3M|  const int n_topright_px =
 1818|  12.3M|      have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right;
  ------------------
  |  |   34|   699k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 687k, False: 11.8k]
  |  |  ------------------
  ------------------
  |  Branch (1818:7): [True: 699k, False: 11.6M]
  ------------------
 1819|  12.3M|  const int n_bottomleft_px =
 1820|  12.3M|      have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left;
  ------------------
  |  |   34|   700k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 694k, False: 5.16k]
  |  |  ------------------
  ------------------
  |  Branch (1820:7): [True: 700k, False: 11.6M]
  ------------------
 1821|  12.3M|#if CONFIG_AV1_HIGHBITDEPTH
 1822|  12.3M|  if (is_hbd) {
  ------------------
  |  Branch (1822:7): [True: 7.16M, False: 5.19M]
  ------------------
 1823|  7.16M|    highbd_build_directional_and_filter_intra_predictors(
 1824|  7.16M|        ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
 1825|  7.16M|        tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px,
 1826|  7.16M|        n_bottomleft_px, intra_edge_filter_type, xd->bd);
 1827|  7.16M|    return;
 1828|  7.16M|  }
 1829|  5.19M|#endif
 1830|  5.19M|  build_directional_and_filter_intra_predictors(
 1831|  5.19M|      ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
 1832|  5.19M|      tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px,
 1833|  5.19M|      n_bottomleft_px, intra_edge_filter_type);
 1834|  5.19M|}
av1_predict_intra_block_facade:
 1838|   112M|                                    TX_SIZE tx_size) {
 1839|   112M|  const MB_MODE_INFO *const mbmi = xd->mi[0];
 1840|   112M|  struct macroblockd_plane *const pd = &xd->plane[plane];
 1841|   112M|  const int dst_stride = pd->dst.stride;
 1842|   112M|  uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
  ------------------
  |  |   39|   112M|#define MI_SIZE_LOG2 2
  ------------------
 1843|   112M|  const PREDICTION_MODE mode =
 1844|   112M|      (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
  ------------------
  |  |  226|   112M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (1844:7): [True: 39.0M, False: 73.4M]
  ------------------
 1845|   112M|  const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0;
 1846|   112M|  const FILTER_INTRA_MODE filter_intra_mode =
 1847|   112M|      (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra)
  ------------------
  |  |  226|   225M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (1847:8): [True: 39.0M, False: 73.4M]
  |  Branch (1847:32): [True: 1.41M, False: 37.6M]
  ------------------
 1848|   112M|          ? mbmi->filter_intra_mode_info.filter_intra_mode
 1849|   112M|          : FILTER_INTRA_MODES;
 1850|   112M|  const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
  ------------------
  |  |  226|   112M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
                const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
  ------------------
  |  |  468|   112M|#define ANGLE_STEP 3
  ------------------
 1851|   112M|  const SequenceHeader *seq_params = cm->seq_params;
 1852|       |
 1853|   112M|#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 1854|   112M|  if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
  ------------------
  |  |  226|   225M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (1854:7): [True: 73.4M, False: 39.0M]
  |  Branch (1854:31): [True: 2.97M, False: 70.5M]
  ------------------
 1855|       |#if CONFIG_DEBUG
 1856|       |    assert(is_cfl_allowed(xd));
 1857|       |    const BLOCK_SIZE plane_bsize =
 1858|       |        get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
 1859|       |    (void)plane_bsize;
 1860|       |    assert(plane_bsize < BLOCK_SIZES_ALL);
 1861|       |    if (!xd->lossless[mbmi->segment_id]) {
 1862|       |      assert(blk_col == 0);
 1863|       |      assert(blk_row == 0);
 1864|       |      assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
 1865|       |      assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
 1866|       |    }
 1867|       |#endif
 1868|  2.97M|    CFL_CTX *const cfl = &xd->cfl;
 1869|  2.97M|    CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
 1870|  2.97M|    if (!cfl->dc_pred_is_cached[pred_plane]) {
  ------------------
  |  Branch (1870:9): [True: 2.97M, False: 18.4E]
  ------------------
 1871|  2.97M|      av1_predict_intra_block(xd, seq_params->sb_size,
 1872|  2.97M|                              seq_params->enable_intra_edge_filter, pd->width,
 1873|  2.97M|                              pd->height, tx_size, mode, angle_delta,
 1874|  2.97M|                              use_palette, filter_intra_mode, dst, dst_stride,
 1875|  2.97M|                              dst, dst_stride, blk_col, blk_row, plane);
 1876|  2.97M|      if (cfl->use_dc_pred_cache) {
  ------------------
  |  Branch (1876:11): [True: 0, False: 2.97M]
  ------------------
 1877|      0|        cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]);
 1878|      0|        cfl->dc_pred_is_cached[pred_plane] = true;
 1879|      0|      }
 1880|  18.4E|    } else {
 1881|  18.4E|      cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane);
 1882|  18.4E|    }
 1883|  2.97M|    av1_cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
 1884|  2.97M|    return;
 1885|  2.97M|  }
 1886|   109M|#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
 1887|   109M|  av1_predict_intra_block(
 1888|   109M|      xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
 1889|   109M|      pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode,
 1890|   109M|      dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane);
 1891|   109M|}
av1_init_intra_predictors:
 1893|  16.1k|void av1_init_intra_predictors(void) {
 1894|  16.1k|  aom_once(init_intra_predictors_internal);
 1895|  16.1k|}
reconintra.c:highbd_build_non_directional_intra_predictors:
 1557|  88.3M|    int bit_depth) {
 1558|  88.3M|  int i = 0;
 1559|  88.3M|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|  88.3M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1560|  88.3M|  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
  ------------------
  |  |   75|  88.3M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1561|  88.3M|  const int txwpx = tx_size_wide[tx_size];
 1562|  88.3M|  const int txhpx = tx_size_high[tx_size];
 1563|  88.3M|  int need_left = extend_modes[mode] & NEED_LEFT;
 1564|  88.3M|  int need_above = extend_modes[mode] & NEED_ABOVE;
 1565|  88.3M|  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
 1566|  88.3M|  const uint16_t *above_ref = ref - ref_stride;
 1567|  88.3M|  const uint16_t *left_ref = ref - 1;
 1568|  88.3M|  const int base = 128 << (bit_depth - 8);
 1569|       |
 1570|  88.3M|  assert(n_top_px >= 0);
 1571|  88.3M|  assert(n_left_px >= 0);
 1572|  88.3M|  assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
 1573|  88.3M|         mode == SMOOTH_H_PRED || mode == PAETH_PRED);
 1574|       |
 1575|  88.3M|  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
  ------------------
  |  Branch (1575:8): [True: 0, False: 88.3M]
  |  Branch (1575:23): [True: 0, False: 0]
  |  Branch (1575:43): [True: 0, False: 88.3M]
  |  Branch (1575:57): [True: 0, False: 0]
  ------------------
 1576|      0|    int val = 0;
 1577|      0|    if (need_left) {
  ------------------
  |  Branch (1577:9): [True: 0, False: 0]
  ------------------
 1578|      0|      val = (n_top_px > 0) ? above_ref[0] : base + 1;
  ------------------
  |  Branch (1578:13): [True: 0, False: 0]
  ------------------
 1579|      0|    } else {
 1580|      0|      val = (n_left_px > 0) ? left_ref[0] : base - 1;
  ------------------
  |  Branch (1580:13): [True: 0, False: 0]
  ------------------
 1581|      0|    }
 1582|      0|    for (i = 0; i < txhpx; ++i) {
  ------------------
  |  Branch (1582:17): [True: 0, False: 0]
  ------------------
 1583|      0|      aom_memset16(dst, val, txwpx);
 1584|      0|      dst += dst_stride;
 1585|      0|    }
 1586|      0|    return;
 1587|      0|  }
 1588|       |
 1589|  88.3M|  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  88.3M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1590|  88.3M|  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  88.3M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1591|  88.3M|  uint16_t *const above_row = above_data + 16;
 1592|  88.3M|  uint16_t *const left_col = left_data + 16;
 1593|       |
 1594|  88.3M|  if (need_left) {
  ------------------
  |  Branch (1594:7): [True: 88.3M, False: 17]
  ------------------
 1595|  88.3M|    aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  88.3M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  88.3M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  88.3M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1596|  88.3M|    if (n_left_px > 0) {
  ------------------
  |  Branch (1596:9): [True: 86.5M, False: 1.84M]
  ------------------
 1597|   608M|      for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
  ------------------
  |  Branch (1597:19): [True: 521M, False: 86.5M]
  ------------------
 1598|  86.5M|      if (i < txhpx) aom_memset16(&left_col[i], left_col[i - 1], txhpx - i);
  ------------------
  |  Branch (1598:11): [True: 81.6k, False: 86.4M]
  ------------------
 1599|  86.5M|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1599:16): [True: 1.75M, False: 91.2k]
  ------------------
 1600|  1.75M|      aom_memset16(left_col, above_ref[0], txhpx);
 1601|  1.75M|    }
 1602|  88.3M|  }
 1603|       |
 1604|  88.3M|  if (need_above) {
  ------------------
  |  Branch (1604:7): [True: 88.3M, False: 682]
  ------------------
 1605|  88.3M|    aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  88.3M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  88.3M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  88.3M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1606|  88.3M|    if (n_top_px > 0) {
  ------------------
  |  Branch (1606:9): [True: 87.6M, False: 707k]
  ------------------
 1607|  87.6M|      memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
 1608|  87.6M|      i = n_top_px;
 1609|  87.6M|      if (i < txwpx) aom_memset16(&above_row[i], above_row[i - 1], (txwpx - i));
  ------------------
  |  Branch (1609:11): [True: 103k, False: 87.5M]
  ------------------
 1610|  87.6M|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1610:16): [True: 617k, False: 89.8k]
  ------------------
 1611|   617k|      aom_memset16(above_row, left_ref[0], txwpx);
 1612|   617k|    }
 1613|  88.3M|  }
 1614|       |
 1615|  88.3M|  if (need_above_left) {
  ------------------
  |  Branch (1615:7): [True: 3.04M, False: 85.3M]
  ------------------
 1616|  3.04M|    if (n_top_px > 0 && n_left_px > 0) {
  ------------------
  |  Branch (1616:9): [True: 2.94M, False: 94.5k]
  |  Branch (1616:25): [True: 2.87M, False: 71.0k]
  ------------------
 1617|  2.87M|      above_row[-1] = above_ref[-1];
 1618|  2.87M|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1618:16): [True: 71.0k, False: 94.5k]
  ------------------
 1619|  71.0k|      above_row[-1] = above_ref[0];
 1620|  94.5k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1620:16): [True: 92.3k, False: 2.25k]
  ------------------
 1621|  92.3k|      above_row[-1] = left_ref[0];
 1622|  92.3k|    } else {
 1623|  2.25k|      above_row[-1] = base;
 1624|  2.25k|    }
 1625|  3.04M|    left_col[-1] = above_row[-1];
 1626|  3.04M|  }
 1627|       |
 1628|  88.3M|  if (mode == DC_PRED) {
  ------------------
  |  Branch (1628:7): [True: 82.3M, False: 5.98M]
  ------------------
 1629|  82.3M|    dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
 1630|  82.3M|        dst, dst_stride, above_row, left_col, bit_depth);
 1631|  82.3M|  } else {
 1632|  5.98M|    pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth);
 1633|  5.98M|  }
 1634|  88.3M|}
reconintra.c:build_non_directional_intra_predictors:
 1250|  12.5M|    PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px) {
 1251|  12.5M|  const uint8_t *above_ref = ref - ref_stride;
 1252|  12.5M|  const uint8_t *left_ref = ref - 1;
 1253|  12.5M|  const int txwpx = tx_size_wide[tx_size];
 1254|  12.5M|  const int txhpx = tx_size_high[tx_size];
 1255|  12.5M|  const int need_left = extend_modes[mode] & NEED_LEFT;
 1256|  12.5M|  const int need_above = extend_modes[mode] & NEED_ABOVE;
 1257|  12.5M|  const int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
 1258|  12.5M|  int i = 0;
 1259|  12.5M|  assert(n_top_px >= 0);
 1260|  12.5M|  assert(n_left_px >= 0);
 1261|  12.5M|  assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
 1262|  12.5M|         mode == SMOOTH_H_PRED || mode == PAETH_PRED);
 1263|       |
 1264|  12.5M|  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
  ------------------
  |  Branch (1264:8): [True: 0, False: 12.5M]
  |  Branch (1264:23): [True: 0, False: 0]
  |  Branch (1264:43): [True: 0, False: 12.5M]
  |  Branch (1264:57): [True: 0, False: 0]
  ------------------
 1265|      0|    int val = 0;
 1266|      0|    if (need_left) {
  ------------------
  |  Branch (1266:9): [True: 0, False: 0]
  ------------------
 1267|      0|      val = (n_top_px > 0) ? above_ref[0] : 129;
  ------------------
  |  Branch (1267:13): [True: 0, False: 0]
  ------------------
 1268|      0|    } else {
 1269|      0|      val = (n_left_px > 0) ? left_ref[0] : 127;
  ------------------
  |  Branch (1269:13): [True: 0, False: 0]
  ------------------
 1270|      0|    }
 1271|      0|    for (i = 0; i < txhpx; ++i) {
  ------------------
  |  Branch (1271:17): [True: 0, False: 0]
  ------------------
 1272|      0|      memset(dst, val, txwpx);
 1273|      0|      dst += dst_stride;
 1274|      0|    }
 1275|      0|    return;
 1276|      0|  }
 1277|       |
 1278|  12.5M|  DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  12.5M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1279|  12.5M|  DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  12.5M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1280|  12.5M|  uint8_t *const above_row = above_data + 16;
 1281|  12.5M|  uint8_t *const left_col = left_data + 16;
 1282|       |
 1283|  12.5M|  if (need_left) {
  ------------------
  |  Branch (1283:7): [True: 12.5M, False: 200]
  ------------------
 1284|  12.5M|    memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  12.5M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  12.5M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  12.5M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1285|  12.5M|    if (n_left_px > 0) {
  ------------------
  |  Branch (1285:9): [True: 12.1M, False: 360k]
  ------------------
 1286|   200M|      for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
  ------------------
  |  Branch (1286:19): [True: 188M, False: 12.1M]
  ------------------
 1287|  12.1M|      if (i < txhpx) memset(&left_col[i], left_col[i - 1], txhpx - i);
  ------------------
  |  Branch (1287:11): [True: 70.3k, False: 12.1M]
  ------------------
 1288|  12.1M|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1288:16): [True: 274k, False: 85.9k]
  ------------------
 1289|   274k|      memset(left_col, above_ref[0], txhpx);
 1290|   274k|    }
 1291|  12.5M|  }
 1292|       |
 1293|  12.5M|  if (need_above) {
  ------------------
  |  Branch (1293:7): [True: 12.5M, False: 149]
  ------------------
 1294|  12.5M|    memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  12.5M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  12.5M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  12.5M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1295|  12.5M|    if (n_top_px > 0) {
  ------------------
  |  Branch (1295:9): [True: 12.1M, False: 419k]
  ------------------
 1296|  12.1M|      memcpy(above_row, above_ref, n_top_px);
 1297|  12.1M|      i = n_top_px;
 1298|  12.1M|      if (i < txwpx) memset(&above_row[i], above_row[i - 1], txwpx - i);
  ------------------
  |  Branch (1298:11): [True: 93.4k, False: 12.0M]
  ------------------
 1299|  12.1M|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1299:16): [True: 333k, False: 85.8k]
  ------------------
 1300|   333k|      memset(above_row, left_ref[0], txwpx);
 1301|   333k|    }
 1302|  12.5M|  }
 1303|       |
 1304|  12.5M|  if (need_above_left) {
  ------------------
  |  Branch (1304:7): [True: 2.59M, False: 9.95M]
  ------------------
 1305|  2.59M|    if (n_top_px > 0 && n_left_px > 0) {
  ------------------
  |  Branch (1305:9): [True: 2.53M, False: 56.3k]
  |  Branch (1305:25): [True: 2.46M, False: 68.6k]
  ------------------
 1306|  2.46M|      above_row[-1] = above_ref[-1];
 1307|  2.46M|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1307:16): [True: 68.6k, False: 56.3k]
  ------------------
 1308|  68.6k|      above_row[-1] = above_ref[0];
 1309|  68.6k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1309:16): [True: 52.3k, False: 4.01k]
  ------------------
 1310|  52.3k|      above_row[-1] = left_ref[0];
 1311|  52.3k|    } else {
 1312|  4.01k|      above_row[-1] = 128;
 1313|  4.01k|    }
 1314|  2.59M|    left_col[-1] = above_row[-1];
 1315|  2.59M|  }
 1316|       |
 1317|  12.5M|  if (mode == DC_PRED) {
  ------------------
  |  Branch (1317:7): [True: 8.08M, False: 4.45M]
  ------------------
 1318|  8.08M|    dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row,
 1319|  8.08M|                                                  left_col);
 1320|  8.08M|  } else {
 1321|  4.45M|    pred[mode][tx_size](dst, dst_stride, above_row, left_col);
 1322|  4.45M|  }
 1323|  12.5M|}
reconintra.c:scale_chroma_bsize:
 1638|  3.35M|                                            int subsampling_y) {
 1639|  3.35M|  assert(subsampling_x >= 0 && subsampling_x < 2);
 1640|  3.35M|  assert(subsampling_y >= 0 && subsampling_y < 2);
 1641|  3.35M|  BLOCK_SIZE bs = bsize;
 1642|  3.35M|  switch (bsize) {
 1643|  16.8k|    case BLOCK_4X4:
  ------------------
  |  Branch (1643:5): [True: 16.8k, False: 3.34M]
  ------------------
 1644|  16.8k|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1644:11): [True: 16.8k, False: 0]
  |  Branch (1644:33): [True: 16.7k, False: 82]
  ------------------
 1645|  16.7k|        bs = BLOCK_8X8;
 1646|     82|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1646:16): [True: 82, False: 0]
  ------------------
 1647|     82|        bs = BLOCK_8X4;
 1648|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1648:16): [True: 0, False: 0]
  ------------------
 1649|      0|        bs = BLOCK_4X8;
 1650|  16.8k|      break;
 1651|  30.9k|    case BLOCK_4X8:
  ------------------
  |  Branch (1651:5): [True: 30.9k, False: 3.32M]
  ------------------
 1652|  30.9k|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1652:11): [True: 30.9k, False: 0]
  |  Branch (1652:33): [True: 30.9k, False: 0]
  ------------------
 1653|  30.9k|        bs = BLOCK_8X8;
 1654|      0|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1654:16): [True: 0, False: 0]
  ------------------
 1655|      0|        bs = BLOCK_8X8;
 1656|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1656:16): [True: 0, False: 0]
  ------------------
 1657|      0|        bs = BLOCK_4X8;
 1658|  30.9k|      break;
 1659|  48.1k|    case BLOCK_8X4:
  ------------------
  |  Branch (1659:5): [True: 48.1k, False: 3.31M]
  ------------------
 1660|  48.1k|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1660:11): [True: 48.1k, False: 0]
  |  Branch (1660:33): [True: 47.0k, False: 1.14k]
  ------------------
 1661|  47.0k|        bs = BLOCK_8X8;
 1662|  1.14k|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1662:16): [True: 1.14k, False: 0]
  ------------------
 1663|  1.14k|        bs = BLOCK_8X4;
 1664|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1664:16): [True: 0, False: 0]
  ------------------
 1665|      0|        bs = BLOCK_8X8;
 1666|  48.1k|      break;
 1667|  36.6k|    case BLOCK_4X16:
  ------------------
  |  Branch (1667:5): [True: 36.6k, False: 3.32M]
  ------------------
 1668|  36.6k|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1668:11): [True: 36.6k, False: 0]
  |  Branch (1668:33): [True: 36.6k, False: 0]
  ------------------
 1669|  36.6k|        bs = BLOCK_8X16;
 1670|      0|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1670:16): [True: 0, False: 0]
  ------------------
 1671|      0|        bs = BLOCK_8X16;
 1672|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1672:16): [True: 0, False: 0]
  ------------------
 1673|      0|        bs = BLOCK_4X16;
 1674|  36.6k|      break;
 1675|  65.8k|    case BLOCK_16X4:
  ------------------
  |  Branch (1675:5): [True: 65.8k, False: 3.29M]
  ------------------
 1676|  65.8k|      if (subsampling_x == 1 && subsampling_y == 1)
  ------------------
  |  Branch (1676:11): [True: 65.8k, False: 0]
  |  Branch (1676:33): [True: 65.1k, False: 700]
  ------------------
 1677|  65.1k|        bs = BLOCK_16X8;
 1678|    700|      else if (subsampling_x == 1)
  ------------------
  |  Branch (1678:16): [True: 700, False: 0]
  ------------------
 1679|    700|        bs = BLOCK_16X4;
 1680|      0|      else if (subsampling_y == 1)
  ------------------
  |  Branch (1680:16): [True: 0, False: 0]
  ------------------
 1681|      0|        bs = BLOCK_16X8;
 1682|  65.8k|      break;
 1683|  3.16M|    default: break;
  ------------------
  |  Branch (1683:5): [True: 3.16M, False: 198k]
  ------------------
 1684|  3.35M|  }
 1685|  3.35M|  return bs;
 1686|  3.35M|}
reconintra.c:has_top_right:
  199|  1.29M|                         int col_off, int ss_x, int ss_y) {
  200|  1.29M|  if (!top_available || !right_available) return 0;
  ------------------
  |  Branch (200:7): [True: 65.1k, False: 1.23M]
  |  Branch (200:25): [True: 22.5k, False: 1.21M]
  ------------------
  201|       |
  202|  1.21M|  const int bw_unit = mi_size_wide[bsize];
  203|  1.21M|  const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1);
  ------------------
  |  |   35|  1.21M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.04M, False: 165k]
  |  |  ------------------
  ------------------
  204|  1.21M|  const int top_right_count_unit = tx_size_wide_unit[txsz];
  205|       |
  206|  1.21M|  if (row_off > 0) {  // Just need to check if enough pixels on the right.
  ------------------
  |  Branch (206:7): [True: 221k, False: 990k]
  ------------------
  207|   221k|    if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) {
  ------------------
  |  Branch (207:9): [True: 113k, False: 107k]
  ------------------
  208|       |      // Special case: For 128x128 blocks, the transform unit whose
  209|       |      // top-right corner is at the center of the block does in fact have
  210|       |      // pixels available at its top-right corner.
  211|   113k|      if (row_off == mi_size_high[BLOCK_64X64] >> ss_y &&
  ------------------
  |  Branch (211:11): [True: 37.0k, False: 76.7k]
  ------------------
  212|   113k|          col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) {
  ------------------
  |  Branch (212:11): [True: 16.4k, False: 20.5k]
  ------------------
  213|  16.4k|        return 1;
  214|  16.4k|      }
  215|  97.2k|      const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
  216|  97.2k|      const int col_off_64 = col_off % plane_bw_unit_64;
  217|  97.2k|      return col_off_64 + top_right_count_unit < plane_bw_unit_64;
  218|   113k|    }
  219|   107k|    return col_off + top_right_count_unit < plane_bw_unit;
  220|   990k|  } else {
  221|       |    // All top-right pixels are in the block above, which is already available.
  222|   990k|    if (col_off + top_right_count_unit < plane_bw_unit) return 1;
  ------------------
  |  Branch (222:9): [True: 51.3k, False: 938k]
  ------------------
  223|       |
  224|   938k|    const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
  225|   938k|    const int bh_in_mi_log2 = mi_size_high_log2[bsize];
  226|   938k|    const int sb_mi_size = mi_size_high[sb_size];
  227|   938k|    const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
  228|   938k|    const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
  229|       |
  230|       |    // Top row of superblock: so top-right pixels are in the top and/or
  231|       |    // top-right superblocks, both of which are already available.
  232|   938k|    if (blk_row_in_sb == 0) return 1;
  ------------------
  |  Branch (232:9): [True: 115k, False: 823k]
  ------------------
  233|       |
  234|       |    // Rightmost column of superblock (and not the top row): so top-right pixels
  235|       |    // fall in the right superblock, which is not available yet.
  236|   823k|    if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) {
  ------------------
  |  Branch (236:9): [True: 142k, False: 681k]
  ------------------
  237|   142k|      return 0;
  238|   142k|    }
  239|       |
  240|       |    // General case (neither top row nor rightmost column): check if the
  241|       |    // top-right block is coded before the current block.
  242|   681k|    const int this_blk_index =
  243|   681k|        ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
  ------------------
  |  |   43|   681k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   681k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   681k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  244|   681k|        blk_col_in_sb + 0;
  245|   681k|    const int idx1 = this_blk_index / 8;
  246|   681k|    const int idx2 = this_blk_index % 8;
  247|   681k|    const uint8_t *has_tr_table = get_has_tr_table(partition, bsize);
  248|   681k|    return (has_tr_table[idx1] >> idx2) & 1;
  249|   823k|  }
  250|  1.21M|}
reconintra.c:get_has_tr_table:
  183|   681k|                                       BLOCK_SIZE bsize) {
  184|   681k|  const uint8_t *ret = NULL;
  185|       |  // If this is a mixed vertical partition, look up bsize in orders_vert.
  186|   681k|  if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
  ------------------
  |  Branch (186:7): [True: 31.6k, False: 649k]
  |  Branch (186:40): [True: 32.6k, False: 617k]
  ------------------
  187|  64.3k|    assert(bsize < BLOCK_SIZES);
  188|  64.3k|    ret = has_tr_vert_tables[bsize];
  189|   617k|  } else {
  190|   617k|    ret = has_tr_tables[bsize];
  191|   617k|  }
  192|   681k|  assert(ret);
  193|   681k|  return ret;
  194|   681k|}
reconintra.c:has_bottom_left:
  384|  2.08M|                           int col_off, int ss_x, int ss_y) {
  385|  2.08M|  if (!bottom_available || !left_available) return 0;
  ------------------
  |  Branch (385:7): [True: 32.3k, False: 2.05M]
  |  Branch (385:28): [True: 191k, False: 1.86M]
  ------------------
  386|       |
  387|       |  // Special case for 128x* blocks, when col_off is half the block width.
  388|       |  // This is needed because 128x* superblocks are divided into 64x* blocks in
  389|       |  // raster order
  390|  1.86M|  if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) {
  ------------------
  |  Branch (390:7): [True: 198k, False: 1.66M]
  |  Branch (390:64): [True: 170k, False: 27.2k]
  ------------------
  391|   170k|    const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
  392|   170k|    const int col_off_64 = col_off % plane_bw_unit_64;
  393|   170k|    if (col_off_64 == 0) {
  ------------------
  |  Branch (393:9): [True: 70.3k, False: 100k]
  ------------------
  394|       |      // We are at the left edge of top-right or bottom-right 64x* block.
  395|  70.3k|      const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y;
  396|  70.3k|      const int row_off_64 = row_off % plane_bh_unit_64;
  397|  70.3k|      const int plane_bh_unit =
  398|  70.3k|          AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64);
  ------------------
  |  |   34|  70.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 70.3k]
  |  |  ------------------
  ------------------
  399|       |      // Check if all bottom-left pixels are in the left 64x* block (which is
  400|       |      // already coded).
  401|  70.3k|      return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit;
  402|  70.3k|    }
  403|   170k|  }
  404|       |
  405|  1.79M|  if (col_off > 0) {
  ------------------
  |  Branch (405:7): [True: 339k, False: 1.45M]
  ------------------
  406|       |    // Bottom-left pixels are in the bottom-left block, which is not available.
  407|   339k|    return 0;
  408|  1.45M|  } else {
  409|  1.45M|    const int bh_unit = mi_size_high[bsize];
  410|  1.45M|    const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1);
  ------------------
  |  |   35|  1.45M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.15M, False: 302k]
  |  |  ------------------
  ------------------
  411|  1.45M|    const int bottom_left_count_unit = tx_size_high_unit[txsz];
  412|       |
  413|       |    // All bottom-left pixels are in the left block, which is already available.
  414|  1.45M|    if (row_off + bottom_left_count_unit < plane_bh_unit) return 1;
  ------------------
  |  Branch (414:9): [True: 76.1k, False: 1.37M]
  ------------------
  415|       |
  416|  1.37M|    const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
  417|  1.37M|    const int bh_in_mi_log2 = mi_size_high_log2[bsize];
  418|  1.37M|    const int sb_mi_size = mi_size_high[sb_size];
  419|  1.37M|    const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
  420|  1.37M|    const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
  421|       |
  422|       |    // Leftmost column of superblock: so bottom-left pixels maybe in the left
  423|       |    // and/or bottom-left superblocks. But only the left superblock is
  424|       |    // available, so check if all required pixels fall in that superblock.
  425|  1.37M|    if (blk_col_in_sb == 0) {
  ------------------
  |  Branch (425:9): [True: 227k, False: 1.15M]
  ------------------
  426|   227k|      const int blk_start_row_off =
  427|   227k|          blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>
  ------------------
  |  |   39|   227k|#define MI_SIZE_LOG2 2
  ------------------
                        blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>
  ------------------
  |  |   39|   227k|#define MI_SIZE_LOG2 2
  ------------------
  428|   227k|          ss_y;
  429|   227k|      const int row_off_in_sb = blk_start_row_off + row_off;
  430|   227k|      const int sb_height_unit = sb_mi_size >> ss_y;
  431|   227k|      return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
  432|   227k|    }
  433|       |
  434|       |    // Bottom row of superblock (and not the leftmost column): so bottom-left
  435|       |    // pixels fall in the bottom superblock, which is not available yet.
  436|  1.15M|    if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0;
  ------------------
  |  Branch (436:9): [True: 163k, False: 987k]
  ------------------
  437|       |
  438|       |    // General case (neither leftmost column nor bottom row): check if the
  439|       |    // bottom-left block is coded before the current block.
  440|   987k|    const int this_blk_index =
  441|   987k|        ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
  ------------------
  |  |   43|   987k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   987k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   987k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  442|   987k|        blk_col_in_sb + 0;
  443|   987k|    const int idx1 = this_blk_index / 8;
  444|   987k|    const int idx2 = this_blk_index % 8;
  445|   987k|    const uint8_t *has_bl_table = get_has_bl_table(partition, bsize);
  446|   987k|    return (has_bl_table[idx1] >> idx2) & 1;
  447|  1.15M|  }
  448|  1.79M|}
reconintra.c:get_has_bl_table:
  368|   987k|                                       BLOCK_SIZE bsize) {
  369|   987k|  const uint8_t *ret = NULL;
  370|       |  // If this is a mixed vertical partition, look up bsize in orders_vert.
  371|   987k|  if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
  ------------------
  |  Branch (371:7): [True: 39.7k, False: 948k]
  |  Branch (371:40): [True: 38.3k, False: 909k]
  ------------------
  372|  78.1k|    assert(bsize < BLOCK_SIZES);
  373|  78.1k|    ret = has_bl_vert_tables[bsize];
  374|   909k|  } else {
  375|   909k|    ret = has_bl_tables[bsize];
  376|   909k|  }
  377|   987k|  assert(ret);
  378|   987k|  return ret;
  379|   987k|}
reconintra.c:get_intra_edge_filter_type:
  974|  12.3M|static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) {
  975|  12.3M|  const MB_MODE_INFO *above;
  976|  12.3M|  const MB_MODE_INFO *left;
  977|       |
  978|  12.3M|  if (plane == 0) {
  ------------------
  |  Branch (978:7): [True: 6.26M, False: 6.11M]
  ------------------
  979|  6.26M|    above = xd->above_mbmi;
  980|  6.26M|    left = xd->left_mbmi;
  981|  6.26M|  } else {
  982|  6.11M|    above = xd->chroma_above_mbmi;
  983|  6.11M|    left = xd->chroma_left_mbmi;
  984|  6.11M|  }
  985|       |
  986|  12.3M|  return (above && is_smooth(above, plane)) || (left && is_smooth(left, plane));
  ------------------
  |  Branch (986:11): [True: 11.0M, False: 1.32M]
  |  Branch (986:20): [True: 1.32M, False: 9.72M]
  |  Branch (986:49): [True: 9.87M, False: 1.17M]
  |  Branch (986:57): [True: 1.09M, False: 8.78M]
  ------------------
  987|  12.3M|}
reconintra.c:is_smooth:
  958|  20.9M|static int is_smooth(const MB_MODE_INFO *mbmi, int plane) {
  959|  20.9M|  if (plane == 0) {
  ------------------
  |  Branch (959:7): [True: 10.2M, False: 10.6M]
  ------------------
  960|  10.2M|    const PREDICTION_MODE mode = mbmi->mode;
  961|  10.2M|    return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
  ------------------
  |  Branch (961:13): [True: 736k, False: 9.55M]
  |  Branch (961:36): [True: 336k, False: 9.21M]
  ------------------
  962|  10.2M|            mode == SMOOTH_H_PRED);
  ------------------
  |  Branch (962:13): [True: 357k, False: 8.86M]
  ------------------
  963|  10.6M|  } else {
  964|       |    // uv_mode is not set for inter blocks, so need to explicitly
  965|       |    // detect that case.
  966|  10.6M|    if (is_inter_block(mbmi)) return 0;
  ------------------
  |  Branch (966:9): [True: 1.12M, False: 9.50M]
  ------------------
  967|       |
  968|  9.50M|    const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
  969|  9.50M|    return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED ||
  ------------------
  |  Branch (969:13): [True: 571k, False: 8.93M]
  |  Branch (969:42): [True: 175k, False: 8.75M]
  ------------------
  970|  9.50M|            uv_mode == UV_SMOOTH_H_PRED);
  ------------------
  |  Branch (970:13): [True: 237k, False: 8.52M]
  ------------------
  971|  10.6M|  }
  972|  20.9M|}
reconintra.c:highbd_build_directional_and_filter_intra_predictors:
 1389|  7.16M|    int bit_depth) {
 1390|  7.16M|  int i;
 1391|  7.16M|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|  7.16M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1392|  7.16M|  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
  ------------------
  |  |   75|  7.16M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1393|  7.16M|  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  7.16M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1394|  7.16M|  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  7.16M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1395|  7.16M|  uint16_t *const above_row = above_data + 16;
 1396|  7.16M|  uint16_t *const left_col = left_data + 16;
 1397|  7.16M|  const int txwpx = tx_size_wide[tx_size];
 1398|  7.16M|  const int txhpx = tx_size_high[tx_size];
 1399|  7.16M|  int need_left = extend_modes[mode] & NEED_LEFT;
 1400|  7.16M|  int need_above = extend_modes[mode] & NEED_ABOVE;
 1401|  7.16M|  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
 1402|  7.16M|  const uint16_t *above_ref = ref - ref_stride;
 1403|  7.16M|  const uint16_t *left_ref = ref - 1;
 1404|  7.16M|  const int is_dr_mode = av1_is_directional_mode(mode);
 1405|  7.16M|  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
 1406|  7.16M|  assert(use_filter_intra || is_dr_mode);
 1407|  7.16M|  const int base = 128 << (bit_depth - 8);
 1408|       |  // The left_data, above_data buffers must be zeroed to fix some intermittent
 1409|       |  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
 1410|       |  // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
 1411|       |  // seen to be the potential reason for this issue.
 1412|  7.16M|  aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  7.16M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  7.16M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  7.16M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1413|  7.16M|  aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  7.16M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  7.16M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  7.16M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1414|       |
 1415|       |  // The default values if ref pixels are not available:
 1416|       |  // base   base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
 1417|       |  // base+1   A      B  ..     Y      Z
 1418|       |  // base+1   C      D  ..     W      X
 1419|       |  // base+1   E      F  ..     U      V
 1420|       |  // base+1   G      H  ..     S      T      T      T      T      T
 1421|       |
 1422|  7.16M|  if (is_dr_mode) {
  ------------------
  |  Branch (1422:7): [True: 6.41M, False: 748k]
  ------------------
 1423|  6.41M|    if (p_angle <= 90)
  ------------------
  |  Branch (1423:9): [True: 1.51M, False: 4.90M]
  ------------------
 1424|  1.51M|      need_above = 1, need_left = 0, need_above_left = 1;
 1425|  4.90M|    else if (p_angle < 180)
  ------------------
  |  Branch (1425:14): [True: 1.63M, False: 3.26M]
  ------------------
 1426|  1.63M|      need_above = 1, need_left = 1, need_above_left = 1;
 1427|  3.26M|    else
 1428|  3.26M|      need_above = 0, need_left = 1, need_above_left = 1;
 1429|  6.41M|  }
 1430|  7.16M|  if (use_filter_intra) need_left = need_above = need_above_left = 1;
  ------------------
  |  Branch (1430:7): [True: 744k, False: 6.41M]
  ------------------
 1431|       |
 1432|  7.16M|  assert(n_top_px >= 0);
 1433|  7.15M|  assert(n_topright_px >= -1);
 1434|  7.15M|  assert(n_left_px >= 0);
 1435|  7.15M|  assert(n_bottomleft_px >= -1);
 1436|       |
 1437|  7.15M|  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
  ------------------
  |  Branch (1437:8): [True: 3.26M, False: 3.89M]
  |  Branch (1437:23): [True: 204k, False: 3.06M]
  |  Branch (1437:43): [True: 1.51M, False: 5.44M]
  |  Branch (1437:57): [True: 64.3k, False: 1.44M]
  ------------------
 1438|   268k|    int val;
 1439|   268k|    if (need_left) {
  ------------------
  |  Branch (1439:9): [True: 204k, False: 64.3k]
  ------------------
 1440|   204k|      val = (n_top_px > 0) ? above_ref[0] : base + 1;
  ------------------
  |  Branch (1440:13): [True: 160k, False: 44.0k]
  ------------------
 1441|   204k|    } else {
 1442|  64.3k|      val = (n_left_px > 0) ? left_ref[0] : base - 1;
  ------------------
  |  Branch (1442:13): [True: 54.8k, False: 9.52k]
  ------------------
 1443|  64.3k|    }
 1444|  5.15M|    for (i = 0; i < txhpx; ++i) {
  ------------------
  |  Branch (1444:17): [True: 4.88M, False: 268k]
  ------------------
 1445|  4.88M|      aom_memset16(dst, val, txwpx);
 1446|  4.88M|      dst += dst_stride;
 1447|  4.88M|    }
 1448|   268k|    return;
 1449|   268k|  }
 1450|       |
 1451|       |  // NEED_LEFT
 1452|  6.89M|  if (need_left) {
  ------------------
  |  Branch (1452:7): [True: 5.44M, False: 1.44M]
  ------------------
 1453|  5.44M|    const int num_left_pixels_needed =
 1454|  5.44M|        txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
  ------------------
  |  Branch (1454:18): [True: 1.14M, False: 4.29M]
  ------------------
 1455|  5.44M|    i = 0;
 1456|  5.44M|    if (n_left_px > 0) {
  ------------------
  |  Branch (1456:9): [True: 5.37M, False: 65.3k]
  ------------------
 1457|  68.2M|      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
  ------------------
  |  Branch (1457:14): [True: 62.8M, False: 5.37M]
  ------------------
 1458|  5.37M|      if (n_bottomleft_px > 0) {
  ------------------
  |  Branch (1458:11): [True: 403k, False: 4.97M]
  ------------------
 1459|   403k|        assert(i == txhpx);
 1460|  4.65M|        for (; i < txhpx + n_bottomleft_px; i++)
  ------------------
  |  Branch (1460:16): [True: 4.25M, False: 403k]
  ------------------
 1461|  4.25M|          left_col[i] = left_ref[i * ref_stride];
 1462|   403k|      }
 1463|  5.37M|      if (i < num_left_pixels_needed)
  ------------------
  |  Branch (1463:11): [True: 980k, False: 4.39M]
  ------------------
 1464|   980k|        aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
 1465|  5.37M|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1465:16): [True: 62.0k, False: 3.22k]
  ------------------
 1466|  62.0k|      aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
 1467|  62.0k|    }
 1468|  5.44M|  }
 1469|       |
 1470|       |  // NEED_ABOVE
 1471|  6.89M|  if (need_above) {
  ------------------
  |  Branch (1471:7): [True: 3.82M, False: 3.06M]
  ------------------
 1472|  3.82M|    const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
  ------------------
  |  Branch (1472:48): [True: 765k, False: 3.06M]
  ------------------
 1473|  3.82M|    if (n_top_px > 0) {
  ------------------
  |  Branch (1473:9): [True: 3.76M, False: 61.6k]
  ------------------
 1474|  3.76M|      memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
 1475|  3.76M|      i = n_top_px;
 1476|  3.76M|      if (n_topright_px > 0) {
  ------------------
  |  Branch (1476:11): [True: 410k, False: 3.35M]
  ------------------
 1477|   410k|        assert(n_top_px == txwpx);
 1478|   410k|        memcpy(above_row + txwpx, above_ref + txwpx,
 1479|   410k|               n_topright_px * sizeof(above_ref[0]));
 1480|   410k|        i += n_topright_px;
 1481|   410k|      }
 1482|  3.76M|      if (i < num_top_pixels_needed)
  ------------------
  |  Branch (1482:11): [True: 470k, False: 3.29M]
  ------------------
 1483|   470k|        aom_memset16(&above_row[i], above_row[i - 1],
 1484|   470k|                     num_top_pixels_needed - i);
 1485|  3.76M|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1485:16): [True: 58.4k, False: 3.22k]
  ------------------
 1486|  58.4k|      aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
 1487|  58.4k|    }
 1488|  3.82M|  }
 1489|       |
 1490|  6.89M|  if (need_above_left) {
  ------------------
  |  Branch (1490:7): [True: 6.89M, False: 18.4E]
  ------------------
 1491|  6.89M|    if (n_top_px > 0 && n_left_px > 0) {
  ------------------
  |  Branch (1491:9): [True: 6.72M, False: 167k]
  |  Branch (1491:25): [True: 6.61M, False: 104k]
  ------------------
 1492|  6.61M|      above_row[-1] = above_ref[-1];
 1493|  6.61M|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1493:16): [True: 104k, False: 166k]
  ------------------
 1494|   104k|      above_row[-1] = above_ref[0];
 1495|   166k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1495:16): [True: 164k, False: 2.88k]
  ------------------
 1496|   164k|      above_row[-1] = left_ref[0];
 1497|   164k|    } else {
 1498|  2.88k|      above_row[-1] = base;
 1499|  2.88k|    }
 1500|  6.89M|    left_col[-1] = above_row[-1];
 1501|  6.89M|  }
 1502|       |
 1503|  6.89M|  if (use_filter_intra) {
  ------------------
  |  Branch (1503:7): [True: 744k, False: 6.14M]
  ------------------
 1504|   744k|    highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
 1505|   744k|                                  filter_intra_mode, bit_depth);
 1506|   744k|    return;
 1507|   744k|  }
 1508|       |
 1509|  6.14M|  assert(is_dr_mode);
 1510|  6.14M|  int upsample_above = 0;
 1511|  6.14M|  int upsample_left = 0;
 1512|  6.14M|  if (!disable_edge_filter) {
  ------------------
  |  Branch (1512:7): [True: 4.85M, False: 1.29M]
  ------------------
 1513|  4.85M|    const int need_right = p_angle < 90;
 1514|  4.85M|    const int need_bottom = p_angle > 180;
 1515|  4.85M|    if (p_angle != 90 && p_angle != 180) {
  ------------------
  |  Branch (1515:9): [True: 4.31M, False: 543k]
  |  Branch (1515:26): [True: 2.54M, False: 1.77M]
  ------------------
 1516|  2.54M|      assert(need_above_left);
 1517|  2.54M|      const int ab_le = 1;
 1518|  2.54M|      if (need_above && need_left && (txwpx + txhpx >= 24)) {
  ------------------
  |  Branch (1518:11): [True: 1.62M, False: 910k]
  |  Branch (1518:25): [True: 1.07M, False: 555k]
  |  Branch (1518:38): [True: 475k, False: 599k]
  ------------------
 1519|   475k|        highbd_filter_intra_edge_corner(above_row, left_col);
 1520|   475k|      }
 1521|  2.54M|      if (need_above && n_top_px > 0) {
  ------------------
  |  Branch (1521:11): [True: 1.62M, False: 910k]
  |  Branch (1521:25): [True: 1.60M, False: 22.5k]
  ------------------
 1522|  1.60M|        const int strength = intra_edge_filter_strength(
 1523|  1.60M|            txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
 1524|  1.60M|        const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
  ------------------
  |  Branch (1524:46): [True: 555k, False: 1.05M]
  ------------------
 1525|  1.60M|        av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength);
 1526|  1.60M|      }
 1527|  2.54M|      if (need_left && n_left_px > 0) {
  ------------------
  |  Branch (1527:11): [True: 1.98M, False: 555k]
  |  Branch (1527:24): [True: 1.97M, False: 12.8k]
  ------------------
 1528|  1.97M|        const int strength = intra_edge_filter_strength(
 1529|  1.97M|            txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
 1530|  1.97M|        const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
  ------------------
  |  Branch (1530:47): [True: 910k, False: 1.06M]
  ------------------
 1531|  1.97M|        av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength);
 1532|  1.97M|      }
 1533|  2.54M|    }
 1534|  4.85M|    upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
 1535|  4.85M|                                                 intra_edge_filter_type);
 1536|  4.85M|    if (need_above && upsample_above) {
  ------------------
  |  Branch (1536:9): [True: 2.17M, False: 2.68M]
  |  Branch (1536:23): [True: 301k, False: 1.87M]
  ------------------
 1537|   301k|      const int n_px = txwpx + (need_right ? txhpx : 0);
  ------------------
  |  Branch (1537:33): [True: 138k, False: 163k]
  ------------------
 1538|   301k|      av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth);
 1539|   301k|    }
 1540|  4.85M|    upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
 1541|  4.85M|                                                intra_edge_filter_type);
 1542|  4.85M|    if (need_left && upsample_left) {
  ------------------
  |  Branch (1542:9): [True: 3.75M, False: 1.09M]
  |  Branch (1542:22): [True: 545k, False: 3.21M]
  ------------------
 1543|   545k|      const int n_px = txhpx + (need_bottom ? txwpx : 0);
  ------------------
  |  Branch (1543:33): [True: 318k, False: 227k]
  ------------------
 1544|   545k|      av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth);
 1545|   545k|    }
 1546|  4.85M|  }
 1547|  6.14M|  highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
 1548|  6.14M|                      upsample_above, upsample_left, p_angle, bit_depth);
 1549|  6.14M|}
reconintra.c:highbd_filter_intra_predictor:
  912|   744k|                                          int bd) {
  913|   744k|  int r, c;
  914|   744k|  uint16_t buffer[33][33];
  915|   744k|  const int bw = tx_size_wide[tx_size];
  916|   744k|  const int bh = tx_size_high[tx_size];
  917|       |
  918|   744k|  assert(bw <= 32 && bh <= 32);
  919|       |
  920|  7.84M|  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
  ------------------
  |  Branch (920:15): [True: 7.09M, False: 744k]
  ------------------
  921|   744k|  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
  922|       |
  923|  4.28M|  for (r = 1; r < bh + 1; r += 2)
  ------------------
  |  Branch (923:15): [True: 3.53M, False: 744k]
  ------------------
  924|  15.6M|    for (c = 1; c < bw + 1; c += 4) {
  ------------------
  |  Branch (924:17): [True: 12.1M, False: 3.53M]
  ------------------
  925|  12.1M|      const uint16_t p0 = buffer[r - 1][c - 1];
  926|  12.1M|      const uint16_t p1 = buffer[r - 1][c];
  927|  12.1M|      const uint16_t p2 = buffer[r - 1][c + 1];
  928|  12.1M|      const uint16_t p3 = buffer[r - 1][c + 2];
  929|  12.1M|      const uint16_t p4 = buffer[r - 1][c + 3];
  930|  12.1M|      const uint16_t p5 = buffer[r][c - 1];
  931|  12.1M|      const uint16_t p6 = buffer[r + 1][c - 1];
  932|   109M|      for (int k = 0; k < 8; ++k) {
  ------------------
  |  Branch (932:23): [True: 96.9M, False: 12.1M]
  ------------------
  933|  96.9M|        int r_offset = k >> 2;
  934|  96.9M|        int c_offset = k & 0x03;
  935|  96.9M|        int pr = av1_filter_intra_taps[mode][k][0] * p0 +
  936|  96.9M|                 av1_filter_intra_taps[mode][k][1] * p1 +
  937|  96.9M|                 av1_filter_intra_taps[mode][k][2] * p2 +
  938|  96.9M|                 av1_filter_intra_taps[mode][k][3] * p3 +
  939|  96.9M|                 av1_filter_intra_taps[mode][k][4] * p4 +
  940|  96.9M|                 av1_filter_intra_taps[mode][k][5] * p5 +
  941|  96.9M|                 av1_filter_intra_taps[mode][k][6] * p6;
  942|       |        // Section 7.11.2.3 specifies the right-hand side of the assignment as
  943|       |        //   Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
  944|       |        // Since Clip1() clips a negative value to 0, it is safe to replace
  945|       |        // Round2Signed() with Round2().
  946|  96.9M|        buffer[r + r_offset][c + c_offset] = clip_pixel_highbd(
  947|  96.9M|            ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS), bd);
  ------------------
  |  |   41|  96.9M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  948|  96.9M|      }
  949|  12.1M|    }
  950|       |
  951|  7.83M|  for (r = 0; r < bh; ++r) {
  ------------------
  |  Branch (951:15): [True: 7.09M, False: 744k]
  ------------------
  952|  7.09M|    memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0]));
  953|  7.09M|    dst += stride;
  954|  7.09M|  }
  955|   744k|}
reconintra.c:highbd_filter_intra_edge_corner:
 1350|   475k|                                            uint16_t *p_left) {
 1351|   475k|  const int kernel[3] = { 5, 6, 5 };
 1352|       |
 1353|   475k|  int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
 1354|   475k|          (p_above[0] * kernel[2]);
 1355|   475k|  s = (s + 8) >> 4;
 1356|   475k|  p_above[-1] = s;
 1357|   475k|  p_left[-1] = s;
 1358|   475k|}
reconintra.c:intra_edge_filter_strength:
  989|  7.15M|static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) {
  990|  7.15M|  const int d = abs(delta);
  991|  7.15M|  int strength = 0;
  992|       |
  993|  7.15M|  const int blk_wh = bs0 + bs1;
  994|  7.15M|  if (type == 0) {
  ------------------
  |  Branch (994:7): [True: 5.63M, False: 1.52M]
  ------------------
  995|  5.63M|    if (blk_wh <= 8) {
  ------------------
  |  Branch (995:9): [True: 1.53M, False: 4.09M]
  ------------------
  996|  1.53M|      if (d >= 56) strength = 1;
  ------------------
  |  Branch (996:11): [True: 542k, False: 993k]
  ------------------
  997|  4.09M|    } else if (blk_wh <= 12) {
  ------------------
  |  Branch (997:16): [True: 516k, False: 3.57M]
  ------------------
  998|   516k|      if (d >= 40) strength = 1;
  ------------------
  |  Branch (998:11): [True: 226k, False: 289k]
  ------------------
  999|  3.57M|    } else if (blk_wh <= 16) {
  ------------------
  |  Branch (999:16): [True: 788k, False: 2.78M]
  ------------------
 1000|   788k|      if (d >= 40) strength = 1;
  ------------------
  |  Branch (1000:11): [True: 292k, False: 496k]
  ------------------
 1001|  2.78M|    } else if (blk_wh <= 24) {
  ------------------
  |  Branch (1001:16): [True: 1.03M, False: 1.75M]
  ------------------
 1002|  1.03M|      if (d >= 8) strength = 1;
  ------------------
  |  Branch (1002:11): [True: 813k, False: 216k]
  ------------------
 1003|  1.03M|      if (d >= 16) strength = 2;
  ------------------
  |  Branch (1003:11): [True: 700k, False: 329k]
  ------------------
 1004|  1.03M|      if (d >= 32) strength = 3;
  ------------------
  |  Branch (1004:11): [True: 461k, False: 568k]
  ------------------
 1005|  1.75M|    } else if (blk_wh <= 32) {
  ------------------
  |  Branch (1005:16): [True: 530k, False: 1.22M]
  ------------------
 1006|   530k|      if (d >= 1) strength = 1;
  ------------------
  |  Branch (1006:11): [True: 530k, False: 18.4E]
  ------------------
 1007|   530k|      if (d >= 4) strength = 2;
  ------------------
  |  Branch (1007:11): [True: 455k, False: 74.9k]
  ------------------
 1008|   530k|      if (d >= 32) strength = 3;
  ------------------
  |  Branch (1008:11): [True: 214k, False: 315k]
  ------------------
 1009|  1.22M|    } else {
 1010|  1.22M|      if (d >= 1) strength = 3;
  ------------------
  |  Branch (1010:11): [True: 1.22M, False: 18.4E]
  ------------------
 1011|  1.22M|    }
 1012|  5.63M|  } else {
 1013|  1.52M|    if (blk_wh <= 8) {
  ------------------
  |  Branch (1013:9): [True: 249k, False: 1.27M]
  ------------------
 1014|   249k|      if (d >= 40) strength = 1;
  ------------------
  |  Branch (1014:11): [True: 103k, False: 146k]
  ------------------
 1015|   249k|      if (d >= 64) strength = 2;
  ------------------
  |  Branch (1015:11): [True: 55.7k, False: 194k]
  ------------------
 1016|  1.27M|    } else if (blk_wh <= 16) {
  ------------------
  |  Branch (1016:16): [True: 411k, False: 860k]
  ------------------
 1017|   411k|      if (d >= 20) strength = 1;
  ------------------
  |  Branch (1017:11): [True: 295k, False: 115k]
  ------------------
 1018|   411k|      if (d >= 48) strength = 2;
  ------------------
  |  Branch (1018:11): [True: 124k, False: 286k]
  ------------------
 1019|   860k|    } else if (blk_wh <= 24) {
  ------------------
  |  Branch (1019:16): [True: 334k, False: 526k]
  ------------------
 1020|   334k|      if (d >= 4) strength = 3;
  ------------------
  |  Branch (1020:11): [True: 303k, False: 31.4k]
  ------------------
 1021|   526k|    } else {
 1022|   526k|      if (d >= 1) strength = 3;
  ------------------
  |  Branch (1022:11): [True: 526k, False: 18.4E]
  ------------------
 1023|   526k|    }
 1024|  1.52M|  }
 1025|  7.15M|  return strength;
 1026|  7.15M|}
reconintra.c:highbd_dr_predictor:
  782|  6.14M|                                int upsample_left, int angle, int bd) {
  783|  6.14M|  const int dx = av1_get_dx(angle);
  784|  6.14M|  const int dy = av1_get_dy(angle);
  785|  6.14M|  const int bw = tx_size_wide[tx_size];
  786|  6.14M|  const int bh = tx_size_high[tx_size];
  787|  6.14M|  assert(angle > 0 && angle < 270);
  788|       |
  789|  6.14M|  if (angle > 0 && angle < 90) {
  ------------------
  |  Branch (789:7): [True: 6.14M, False: 18.4E]
  |  Branch (789:20): [True: 765k, False: 5.38M]
  ------------------
  790|   765k|    av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left,
  791|   765k|                                upsample_above, dx, dy, bd);
  792|  5.38M|  } else if (angle > 90 && angle < 180) {
  ------------------
  |  Branch (792:14): [True: 4.70M, False: 680k]
  |  Branch (792:28): [True: 1.63M, False: 3.06M]
  ------------------
  793|  1.63M|    av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left,
  794|  1.63M|                                upsample_above, upsample_left, dx, dy, bd);
  795|  3.74M|  } else if (angle > 180 && angle < 270) {
  ------------------
  |  Branch (795:14): [True: 1.14M, False: 2.59M]
  |  Branch (795:29): [True: 1.14M, False: 2]
  ------------------
  796|  1.14M|    av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left,
  797|  1.14M|                                dx, dy, bd);
  798|  2.59M|  } else if (angle == 90) {
  ------------------
  |  Branch (798:14): [True: 680k, False: 1.91M]
  ------------------
  799|   680k|    pred_high[V_PRED][tx_size](dst, stride, above, left, bd);
  800|  1.91M|  } else if (angle == 180) {
  ------------------
  |  Branch (800:14): [True: 1.91M, False: 18.4E]
  ------------------
  801|  1.91M|    pred_high[H_PRED][tx_size](dst, stride, above, left, bd);
  802|  1.91M|  }
  803|  6.14M|}
reconintra.c:build_directional_and_filter_intra_predictors:
 1088|  5.21M|    int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) {
 1089|  5.21M|  int i;
 1090|  5.21M|  const uint8_t *above_ref = ref - ref_stride;
 1091|  5.21M|  const uint8_t *left_ref = ref - 1;
 1092|  5.21M|  DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  5.21M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1093|  5.21M|  DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
  ------------------
  |  |   19|  5.21M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1094|  5.21M|  uint8_t *const above_row = above_data + 16;
 1095|  5.21M|  uint8_t *const left_col = left_data + 16;
 1096|  5.21M|  const int txwpx = tx_size_wide[tx_size];
 1097|  5.21M|  const int txhpx = tx_size_high[tx_size];
 1098|  5.21M|  int need_left = extend_modes[mode] & NEED_LEFT;
 1099|  5.21M|  int need_above = extend_modes[mode] & NEED_ABOVE;
 1100|  5.21M|  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
 1101|  5.21M|  const int is_dr_mode = av1_is_directional_mode(mode);
 1102|  5.21M|  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
 1103|  5.21M|  assert(use_filter_intra || is_dr_mode);
 1104|       |  // The left_data, above_data buffers must be zeroed to fix some intermittent
 1105|       |  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
 1106|       |  // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to
 1107|       |  // be the potential reason for this issue.
 1108|  5.21M|  memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  5.21M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  5.21M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  5.21M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1109|  5.21M|  memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
  ------------------
  |  |   38|  5.21M|#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
  |  |  ------------------
  |  |  |  |  183|  5.21M|#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |  182|  5.21M|#define MAX_TX_SIZE_LOG2 (6)
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1110|       |
 1111|       |  // The default values if ref pixels are not available:
 1112|       |  // 128 127 127 .. 127 127 127 127 127 127
 1113|       |  // 129  A   B  ..  Y   Z
 1114|       |  // 129  C   D  ..  W   X
 1115|       |  // 129  E   F  ..  U   V
 1116|       |  // 129  G   H  ..  S   T   T   T   T   T
 1117|       |  // ..
 1118|       |
 1119|  5.21M|  if (is_dr_mode) {
  ------------------
  |  Branch (1119:7): [True: 4.54M, False: 669k]
  ------------------
 1120|  4.54M|    if (p_angle <= 90)
  ------------------
  |  Branch (1120:9): [True: 953k, False: 3.59M]
  ------------------
 1121|   953k|      need_above = 1, need_left = 0, need_above_left = 1;
 1122|  3.59M|    else if (p_angle < 180)
  ------------------
  |  Branch (1122:14): [True: 1.28M, False: 2.30M]
  ------------------
 1123|  1.28M|      need_above = 1, need_left = 1, need_above_left = 1;
 1124|  2.30M|    else
 1125|  2.30M|      need_above = 0, need_left = 1, need_above_left = 1;
 1126|  4.54M|  }
 1127|  5.21M|  if (use_filter_intra) need_left = need_above = need_above_left = 1;
  ------------------
  |  Branch (1127:7): [True: 670k, False: 4.54M]
  ------------------
 1128|       |
 1129|  5.21M|  assert(n_top_px >= 0);
 1130|  5.21M|  assert(n_topright_px >= -1);
 1131|  5.21M|  assert(n_left_px >= 0);
 1132|  5.21M|  assert(n_bottomleft_px >= -1);
 1133|       |
 1134|  5.21M|  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
  ------------------
  |  Branch (1134:8): [True: 2.30M, False: 2.91M]
  |  Branch (1134:23): [True: 33.8k, False: 2.27M]
  |  Branch (1134:43): [True: 953k, False: 4.23M]
  |  Branch (1134:57): [True: 28.3k, False: 924k]
  ------------------
 1135|  62.1k|    int val;
 1136|  62.1k|    if (need_left) {
  ------------------
  |  Branch (1136:9): [True: 33.8k, False: 28.3k]
  ------------------
 1137|  33.8k|      val = (n_top_px > 0) ? above_ref[0] : 129;
  ------------------
  |  Branch (1137:13): [True: 26.6k, False: 7.17k]
  ------------------
 1138|  33.8k|    } else {
 1139|  28.3k|      val = (n_left_px > 0) ? left_ref[0] : 127;
  ------------------
  |  Branch (1139:13): [True: 20.9k, False: 7.37k]
  ------------------
 1140|  28.3k|    }
 1141|  1.96M|    for (i = 0; i < txhpx; ++i) {
  ------------------
  |  Branch (1141:17): [True: 1.90M, False: 62.1k]
  ------------------
 1142|  1.90M|      memset(dst, val, txwpx);
 1143|  1.90M|      dst += dst_stride;
 1144|  1.90M|    }
 1145|  62.1k|    return;
 1146|  62.1k|  }
 1147|       |
 1148|       |  // NEED_LEFT
 1149|  5.15M|  if (need_left) {
  ------------------
  |  Branch (1149:7): [True: 4.23M, False: 924k]
  ------------------
 1150|  4.23M|    const int num_left_pixels_needed =
 1151|  4.23M|        txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
  ------------------
  |  Branch (1151:18): [True: 741k, False: 3.48M]
  ------------------
 1152|  4.23M|    i = 0;
 1153|  4.23M|    if (n_left_px > 0) {
  ------------------
  |  Branch (1153:9): [True: 4.17M, False: 53.6k]
  ------------------
 1154|  55.1M|      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
  ------------------
  |  Branch (1154:14): [True: 51.0M, False: 4.17M]
  ------------------
 1155|  4.17M|      if (n_bottomleft_px > 0) {
  ------------------
  |  Branch (1155:11): [True: 296k, False: 3.88M]
  ------------------
 1156|   296k|        assert(i == txhpx);
 1157|  3.67M|        for (; i < txhpx + n_bottomleft_px; i++)
  ------------------
  |  Branch (1157:16): [True: 3.38M, False: 296k]
  ------------------
 1158|  3.38M|          left_col[i] = left_ref[i * ref_stride];
 1159|   296k|      }
 1160|  4.17M|      if (i < num_left_pixels_needed)
  ------------------
  |  Branch (1160:11): [True: 623k, False: 3.55M]
  ------------------
 1161|   623k|        memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
 1162|  4.17M|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1162:16): [True: 36.4k, False: 17.2k]
  ------------------
 1163|  36.4k|      memset(left_col, above_ref[0], num_left_pixels_needed);
 1164|  36.4k|    }
 1165|  4.23M|  }
 1166|       |
 1167|       |  // NEED_ABOVE
 1168|  5.15M|  if (need_above) {
  ------------------
  |  Branch (1168:7): [True: 2.88M, False: 2.27M]
  ------------------
 1169|  2.88M|    const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
  ------------------
  |  Branch (1169:48): [True: 469k, False: 2.41M]
  ------------------
 1170|  2.88M|    if (n_top_px > 0) {
  ------------------
  |  Branch (1170:9): [True: 2.82M, False: 59.8k]
  ------------------
 1171|  2.82M|      memcpy(above_row, above_ref, n_top_px);
 1172|  2.82M|      i = n_top_px;
 1173|  2.82M|      if (n_topright_px > 0) {
  ------------------
  |  Branch (1173:11): [True: 288k, False: 2.53M]
  ------------------
 1174|   288k|        assert(n_top_px == txwpx);
 1175|   288k|        memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
 1176|   288k|        i += n_topright_px;
 1177|   288k|      }
 1178|  2.82M|      if (i < num_top_pixels_needed)
  ------------------
  |  Branch (1178:11): [True: 254k, False: 2.56M]
  ------------------
 1179|   254k|        memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
 1180|  2.82M|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1180:16): [True: 42.6k, False: 17.2k]
  ------------------
 1181|  42.6k|      memset(above_row, left_ref[0], num_top_pixels_needed);
 1182|  42.6k|    }
 1183|  2.88M|  }
 1184|       |
 1185|  5.15M|  if (need_above_left) {
  ------------------
  |  Branch (1185:7): [True: 5.15M, False: 58]
  ------------------
 1186|  5.15M|    if (n_top_px > 0 && n_left_px > 0) {
  ------------------
  |  Branch (1186:9): [True: 5.04M, False: 114k]
  |  Branch (1186:25): [True: 4.98M, False: 59.2k]
  ------------------
 1187|  4.98M|      above_row[-1] = above_ref[-1];
 1188|  4.98M|    } else if (n_top_px > 0) {
  ------------------
  |  Branch (1188:16): [True: 59.2k, False: 114k]
  ------------------
 1189|  59.2k|      above_row[-1] = above_ref[0];
 1190|   114k|    } else if (n_left_px > 0) {
  ------------------
  |  Branch (1190:16): [True: 98.2k, False: 16.6k]
  ------------------
 1191|  98.2k|      above_row[-1] = left_ref[0];
 1192|  98.2k|    } else {
 1193|  16.6k|      above_row[-1] = 128;
 1194|  16.6k|    }
 1195|  5.15M|    left_col[-1] = above_row[-1];
 1196|  5.15M|  }
 1197|       |
 1198|  5.15M|  if (use_filter_intra) {
  ------------------
  |  Branch (1198:7): [True: 670k, False: 4.48M]
  ------------------
 1199|   670k|    av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
 1200|   670k|                               filter_intra_mode);
 1201|   670k|    return;
 1202|   670k|  }
 1203|       |
 1204|  4.48M|  assert(is_dr_mode);
 1205|  4.48M|  int upsample_above = 0;
 1206|  4.48M|  int upsample_left = 0;
 1207|  4.48M|  if (!disable_edge_filter) {
  ------------------
  |  Branch (1207:7): [True: 4.31M, False: 168k]
  ------------------
 1208|  4.31M|    const int need_right = p_angle < 90;
 1209|  4.31M|    const int need_bottom = p_angle > 180;
 1210|  4.31M|    if (p_angle != 90 && p_angle != 180) {
  ------------------
  |  Branch (1210:9): [True: 3.90M, False: 416k]
  |  Branch (1210:26): [True: 2.38M, False: 1.51M]
  ------------------
 1211|  2.38M|      assert(need_above_left);
 1212|  2.38M|      const int ab_le = 1;
 1213|  2.38M|      if (need_above && need_left && (txwpx + txhpx >= 24)) {
  ------------------
  |  Branch (1213:11): [True: 1.67M, False: 708k]
  |  Branch (1213:25): [True: 1.23M, False: 440k]
  |  Branch (1213:38): [True: 416k, False: 821k]
  ------------------
 1214|   416k|        filter_intra_edge_corner(above_row, left_col);
 1215|   416k|      }
 1216|  2.38M|      if (need_above && n_top_px > 0) {
  ------------------
  |  Branch (1216:11): [True: 1.67M, False: 708k]
  |  Branch (1216:25): [True: 1.64M, False: 31.1k]
  ------------------
 1217|  1.64M|        const int strength = intra_edge_filter_strength(
 1218|  1.64M|            txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
 1219|  1.64M|        const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
  ------------------
  |  Branch (1219:46): [True: 440k, False: 1.20M]
  ------------------
 1220|  1.64M|        av1_filter_intra_edge(above_row - ab_le, n_px, strength);
 1221|  1.64M|      }
 1222|  2.38M|      if (need_left && n_left_px > 0) {
  ------------------
  |  Branch (1222:11): [True: 1.94M, False: 440k]
  |  Branch (1222:24): [True: 1.92M, False: 19.2k]
  ------------------
 1223|  1.92M|        const int strength = intra_edge_filter_strength(
 1224|  1.92M|            txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
 1225|  1.92M|        const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
  ------------------
  |  Branch (1225:47): [True: 708k, False: 1.21M]
  ------------------
 1226|  1.92M|        av1_filter_intra_edge(left_col - ab_le, n_px, strength);
 1227|  1.92M|      }
 1228|  2.38M|    }
 1229|  4.31M|    upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
 1230|  4.31M|                                                 intra_edge_filter_type);
 1231|  4.31M|    if (need_above && upsample_above) {
  ------------------
  |  Branch (1231:9): [True: 2.09M, False: 2.22M]
  |  Branch (1231:23): [True: 241k, False: 1.85M]
  ------------------
 1232|   241k|      const int n_px = txwpx + (need_right ? txhpx : 0);
  ------------------
  |  Branch (1232:33): [True: 123k, False: 117k]
  ------------------
 1233|   241k|      av1_upsample_intra_edge(above_row, n_px);
 1234|   241k|    }
 1235|  4.31M|    upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
 1236|  4.31M|                                                intra_edge_filter_type);
 1237|  4.31M|    if (need_left && upsample_left) {
  ------------------
  |  Branch (1237:9): [True: 3.46M, False: 856k]
  |  Branch (1237:22): [True: 735k, False: 2.72M]
  ------------------
 1238|   735k|      const int n_px = txhpx + (need_bottom ? txwpx : 0);
  ------------------
  |  Branch (1238:33): [True: 218k, False: 517k]
  ------------------
 1239|   735k|      av1_upsample_intra_edge(left_col, n_px);
 1240|   735k|    }
 1241|  4.31M|  }
 1242|  4.48M|  dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above,
 1243|  4.48M|               upsample_left, p_angle);
 1244|  4.48M|}
reconintra.c:filter_intra_edge_corner:
 1051|   416k|static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) {
 1052|   416k|  const int kernel[3] = { 5, 6, 5 };
 1053|       |
 1054|   416k|  int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
 1055|   416k|          (p_above[0] * kernel[2]);
 1056|   416k|  s = (s + 8) >> 4;
 1057|   416k|  p_above[-1] = s;
 1058|   416k|  p_left[-1] = s;
 1059|   416k|}
reconintra.c:dr_predictor:
  642|  4.48M|                         int upsample_above, int upsample_left, int angle) {
  643|  4.48M|  const int dx = av1_get_dx(angle);
  644|  4.48M|  const int dy = av1_get_dy(angle);
  645|  4.48M|  const int bw = tx_size_wide[tx_size];
  646|  4.48M|  const int bh = tx_size_high[tx_size];
  647|  4.48M|  assert(angle > 0 && angle < 270);
  648|       |
  649|  4.48M|  if (angle > 0 && angle < 90) {
  ------------------
  |  Branch (649:7): [True: 4.48M, False: 18.4E]
  |  Branch (649:20): [True: 469k, False: 4.01M]
  ------------------
  650|   469k|    av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx,
  651|   469k|                         dy);
  652|  4.01M|  } else if (angle > 90 && angle < 180) {
  ------------------
  |  Branch (652:14): [True: 3.56M, False: 455k]
  |  Branch (652:28): [True: 1.28M, False: 2.27M]
  ------------------
  653|  1.28M|    av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above,
  654|  1.28M|                         upsample_left, dx, dy);
  655|  2.72M|  } else if (angle > 180 && angle < 270) {
  ------------------
  |  Branch (655:14): [True: 741k, False: 1.98M]
  |  Branch (655:29): [True: 741k, False: 18.4E]
  ------------------
  656|   741k|    av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx,
  657|   741k|                         dy);
  658|  1.98M|  } else if (angle == 90) {
  ------------------
  |  Branch (658:14): [True: 455k, False: 1.53M]
  ------------------
  659|   455k|    pred[V_PRED][tx_size](dst, stride, above, left);
  660|  1.53M|  } else if (angle == 180) {
  ------------------
  |  Branch (660:14): [True: 1.53M, False: 18.4E]
  ------------------
  661|  1.53M|    pred[H_PRED][tx_size](dst, stride, above, left);
  662|  1.53M|  }
  663|  4.48M|}
reconintra.c:init_intra_predictors_internal:
  464|      1|static void init_intra_predictors_internal(void) {
  465|      1|  assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
  466|       |
  467|       |#if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
  468|       |#define INIT_RECTANGULAR(p, type)             \
  469|       |  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  470|       |  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  471|       |  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  472|       |  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  473|       |  p[TX_16X32] = aom_##type##_predictor_16x32; \
  474|       |  p[TX_32X16] = aom_##type##_predictor_32x16; \
  475|       |  p[TX_32X64] = aom_##type##_predictor_32x64; \
  476|       |  p[TX_64X32] = aom_##type##_predictor_64x32;
  477|       |#else
  478|      1|#define INIT_RECTANGULAR(p, type)             \
  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  493|      1|#endif  // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
  494|       |
  495|      1|#define INIT_NO_4X4(p, type)                  \
  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  500|      1|  INIT_RECTANGULAR(p, type)
  501|       |
  502|      1|#define INIT_ALL_SIZES(p, type)           \
  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  504|      1|  INIT_NO_4X4(p, type)
  505|       |
  506|      1|  INIT_ALL_SIZES(pred[V_PRED], v)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  507|      1|  INIT_ALL_SIZES(pred[H_PRED], h)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  508|      1|  INIT_ALL_SIZES(pred[PAETH_PRED], paeth)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  509|      1|  INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  510|      1|  INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  511|      1|  INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  512|      1|  INIT_ALL_SIZES(dc_pred[0][0], dc_128)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  513|      1|  INIT_ALL_SIZES(dc_pred[0][1], dc_top)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  514|      1|  INIT_ALL_SIZES(dc_pred[1][0], dc_left)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  515|      1|  INIT_ALL_SIZES(dc_pred[1][1], dc)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  516|      1|#if CONFIG_AV1_HIGHBITDEPTH
  517|      1|  INIT_ALL_SIZES(pred_high[V_PRED], highbd_v)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  518|      1|  INIT_ALL_SIZES(pred_high[H_PRED], highbd_h)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  519|      1|  INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  520|      1|  INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  521|      1|  INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  522|      1|  INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  523|      1|  INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  524|      1|  INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  525|      1|  INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  526|      1|  INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc)
  ------------------
  |  |  503|      1|  p[TX_4X4] = aom_##type##_predictor_4x4; \
  |  |  504|      1|  INIT_NO_4X4(p, type)
  |  |  ------------------
  |  |  |  |  496|      1|  p[TX_8X8] = aom_##type##_predictor_8x8;     \
  |  |  |  |  497|      1|  p[TX_16X16] = aom_##type##_predictor_16x16; \
  |  |  |  |  498|      1|  p[TX_32X32] = aom_##type##_predictor_32x32; \
  |  |  |  |  499|      1|  p[TX_64X64] = aom_##type##_predictor_64x64; \
  |  |  |  |  500|      1|  INIT_RECTANGULAR(p, type)
  |  |  |  |  ------------------
  |  |  |  |  |  |  479|      1|  p[TX_4X8] = aom_##type##_predictor_4x8;     \
  |  |  |  |  |  |  480|      1|  p[TX_8X4] = aom_##type##_predictor_8x4;     \
  |  |  |  |  |  |  481|      1|  p[TX_8X16] = aom_##type##_predictor_8x16;   \
  |  |  |  |  |  |  482|      1|  p[TX_16X8] = aom_##type##_predictor_16x8;   \
  |  |  |  |  |  |  483|      1|  p[TX_16X32] = aom_##type##_predictor_16x32; \
  |  |  |  |  |  |  484|      1|  p[TX_32X16] = aom_##type##_predictor_32x16; \
  |  |  |  |  |  |  485|      1|  p[TX_32X64] = aom_##type##_predictor_32x64; \
  |  |  |  |  |  |  486|      1|  p[TX_64X32] = aom_##type##_predictor_64x32; \
  |  |  |  |  |  |  487|      1|  p[TX_4X16] = aom_##type##_predictor_4x16;   \
  |  |  |  |  |  |  488|      1|  p[TX_16X4] = aom_##type##_predictor_16x4;   \
  |  |  |  |  |  |  489|      1|  p[TX_8X32] = aom_##type##_predictor_8x32;   \
  |  |  |  |  |  |  490|      1|  p[TX_32X8] = aom_##type##_predictor_32x8;   \
  |  |  |  |  |  |  491|      1|  p[TX_16X64] = aom_##type##_predictor_16x64; \
  |  |  |  |  |  |  492|      1|  p[TX_64X16] = aom_##type##_predictor_64x16;
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  527|      1|#endif
  528|      1|#undef intra_pred_allsizes
  529|      1|}

decodeframe.c:av1_allow_intrabc:
   63|   176k|static inline int av1_allow_intrabc(const AV1_COMMON *const cm) {
   64|   176k|  return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
  ------------------
  |  Branch (64:10): [True: 133k, False: 43.3k]
  |  Branch (64:37): [True: 90.0k, False: 43.1k]
  ------------------
   65|   176k|         cm->features.allow_intrabc;
  ------------------
  |  Branch (65:10): [True: 42.1k, False: 47.8k]
  ------------------
   66|   176k|}
decodemv.c:av1_allow_intrabc:
   63|  9.25M|static inline int av1_allow_intrabc(const AV1_COMMON *const cm) {
   64|  9.25M|  return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
  ------------------
  |  Branch (64:10): [True: 9.25M, False: 57]
  |  Branch (64:37): [True: 2.88M, False: 6.36M]
  ------------------
   65|  9.25M|         cm->features.allow_intrabc;
  ------------------
  |  Branch (65:10): [True: 1.90M, False: 984k]
  ------------------
   66|  9.25M|}
decodemv.c:av1_use_angle_delta:
   59|  10.9M|static inline int av1_use_angle_delta(BLOCK_SIZE bsize) {
   60|  10.9M|  return bsize >= BLOCK_8X8;
   61|  10.9M|}
decodemv.c:av1_is_directional_mode:
   51|  18.3M|static inline int av1_is_directional_mode(PREDICTION_MODE mode) {
   52|  18.3M|  return mode >= V_PRED && mode <= D67_PRED;
  ------------------
  |  Branch (52:10): [True: 10.1M, False: 8.16M]
  |  Branch (52:28): [True: 5.25M, False: 4.91M]
  ------------------
   53|  18.3M|}
decodemv.c:av1_filter_intra_allowed:
   76|  10.9M|                                           const MB_MODE_INFO *mbmi) {
   77|  10.9M|  return mbmi->mode == DC_PRED &&
  ------------------
  |  Branch (77:10): [True: 4.43M, False: 6.56M]
  ------------------
   78|  10.9M|         mbmi->palette_mode_info.palette_size[0] == 0 &&
  ------------------
  |  Branch (78:10): [True: 4.34M, False: 88.6k]
  ------------------
   79|  10.9M|         av1_filter_intra_allowed_bsize(cm, mbmi->bsize);
  ------------------
  |  Branch (79:10): [True: 2.15M, False: 2.18M]
  ------------------
   80|  10.9M|}
decodemv.c:av1_filter_intra_allowed_bsize:
   69|  4.34M|                                                 BLOCK_SIZE bs) {
   70|  4.34M|  if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0;
  ------------------
  |  Branch (70:7): [True: 1.85M, False: 2.49M]
  |  Branch (70:47): [True: 1, False: 2.49M]
  ------------------
   71|       |
   72|  2.49M|  return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32;
  ------------------
  |  Branch (72:10): [True: 2.18M, False: 311k]
  |  Branch (72:39): [True: 2.15M, False: 25.3k]
  ------------------
   73|  4.34M|}
reconintra.c:av1_is_directional_mode:
   51|   125M|static inline int av1_is_directional_mode(PREDICTION_MODE mode) {
   52|   125M|  return mode >= V_PRED && mode <= D67_PRED;
  ------------------
  |  Branch (52:10): [True: 32.3M, False: 93.2M]
  |  Branch (52:28): [True: 21.9M, False: 10.4M]
  ------------------
   53|   125M|}
reconintra.c:av1_use_intra_edge_upsample:
  149|  18.3M|                                              int type) {
  150|  18.3M|  const int d = abs(delta);
  151|  18.3M|  const int blk_wh = bs0 + bs1;
  152|  18.3M|  if (d == 0 || d >= 40) return 0;
  ------------------
  |  Branch (152:7): [True: 4.24M, False: 14.0M]
  |  Branch (152:17): [True: 9.59M, False: 4.50M]
  ------------------
  153|  4.49M|  return type ? (blk_wh <= 8) : (blk_wh <= 16);
  ------------------
  |  Branch (153:10): [True: 966k, False: 3.53M]
  ------------------
  154|  18.3M|}
reconintra.c:av1_get_dx:
  122|  10.6M|static inline int av1_get_dx(int angle) {
  123|  10.6M|  if (angle > 0 && angle < 90) {
  ------------------
  |  Branch (123:7): [True: 10.6M, False: 18.4E]
  |  Branch (123:20): [True: 1.23M, False: 9.40M]
  ------------------
  124|  1.23M|    return dr_intra_derivative[angle];
  125|  9.40M|  } else if (angle > 90 && angle < 180) {
  ------------------
  |  Branch (125:14): [True: 8.26M, False: 1.13M]
  |  Branch (125:28): [True: 2.92M, False: 5.33M]
  ------------------
  126|  2.92M|    return dr_intra_derivative[180 - angle];
  127|  6.47M|  } else {
  128|       |    // In this case, we are not really going to use dx. We may return any value.
  129|  6.47M|    return 1;
  130|  6.47M|  }
  131|  10.6M|}
reconintra.c:av1_get_dy:
  137|  10.6M|static inline int av1_get_dy(int angle) {
  138|  10.6M|  if (angle > 90 && angle < 180) {
  ------------------
  |  Branch (138:7): [True: 8.26M, False: 2.37M]
  |  Branch (138:21): [True: 2.92M, False: 5.33M]
  ------------------
  139|  2.92M|    return dr_intra_derivative[angle - 90];
  140|  7.70M|  } else if (angle > 180 && angle < 270) {
  ------------------
  |  Branch (140:14): [True: 1.88M, False: 5.82M]
  |  Branch (140:29): [True: 1.88M, False: 0]
  ------------------
  141|  1.88M|    return dr_intra_derivative[270 - angle];
  142|  5.82M|  } else {
  143|       |    // In this case, we are not really going to use dy. We may return any value.
  144|  5.82M|    return 1;
  145|  5.82M|  }
  146|  10.6M|}
thread_common.c:av1_allow_intrabc:
   63|  91.5k|static inline int av1_allow_intrabc(const AV1_COMMON *const cm) {
   64|  91.5k|  return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
  ------------------
  |  Branch (64:10): [True: 45.4k, False: 46.0k]
  |  Branch (64:37): [True: 21.3k, False: 24.0k]
  ------------------
   65|  91.5k|         cm->features.allow_intrabc;
  ------------------
  |  Branch (65:10): [True: 7.06k, False: 14.2k]
  ------------------
   66|  91.5k|}

av1_get_upscale_convolve_step:
  324|  41.5k|int32_t av1_get_upscale_convolve_step(int in_length, int out_length) {
  325|  41.5k|  return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length;
  ------------------
  |  |   36|  41.5k|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  326|  41.5k|}
av1_upscale_normative_rows:
 1121|  41.5k|                                int plane, int rows) {
 1122|  41.5k|  const int is_uv = (plane > 0);
 1123|  41.5k|  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (1123:20): [True: 19.3k, False: 22.2k]
  |  Branch (1123:29): [True: 16.2k, False: 3.11k]
  ------------------
 1124|  41.5k|  const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
  ------------------
  |  |   41|  41.5k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1125|  41.5k|  const int upscaled_plane_width =
 1126|  41.5k|      ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
  ------------------
  |  |   41|  41.5k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1127|  41.5k|  const int superres_denom = cm->superres_scale_denominator;
 1128|       |
 1129|  41.5k|  TileInfo tile_col;
 1130|  41.5k|  const int32_t x_step_qn = av1_get_upscale_convolve_step(
 1131|  41.5k|      downscaled_plane_width, upscaled_plane_width);
 1132|  41.5k|  int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width,
 1133|  41.5k|                                          upscaled_plane_width, x_step_qn);
 1134|       |
 1135|  87.6k|  for (int j = 0; j < cm->tiles.cols; j++) {
  ------------------
  |  Branch (1135:19): [True: 46.0k, False: 41.5k]
  ------------------
 1136|  46.0k|    av1_tile_set_col(&tile_col, cm, j);
 1137|       |    // Determine the limits of this tile column in both the source
 1138|       |    // and destination images.
 1139|       |    // Note: The actual location which we start sampling from is
 1140|       |    // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases
 1141|       |    // by exactly dst_width * (x_step_qn/2^14) pixels each iteration.
 1142|  46.0k|    const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x);
  ------------------
  |  |   39|  46.0k|#define MI_SIZE_LOG2 2
  ------------------
 1143|  46.0k|    const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x);
  ------------------
  |  |   39|  46.0k|#define MI_SIZE_LOG2 2
  ------------------
 1144|  46.0k|    const int src_width = downscaled_x1 - downscaled_x0;
 1145|       |
 1146|  46.0k|    const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR;
  ------------------
  |  |   22|  46.0k|#define SCALE_NUMERATOR 8
  ------------------
 1147|  46.0k|    int upscaled_x1;
 1148|  46.0k|    if (j == cm->tiles.cols - 1) {
  ------------------
  |  Branch (1148:9): [True: 41.5k, False: 4.44k]
  ------------------
 1149|       |      // Note that we can't just use AOMMIN here - due to rounding,
 1150|       |      // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than
 1151|       |      // upscaled_plane_width.
 1152|  41.5k|      upscaled_x1 = upscaled_plane_width;
 1153|  41.5k|    } else {
 1154|  4.44k|      upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR;
  ------------------
  |  |   22|  4.44k|#define SCALE_NUMERATOR 8
  ------------------
 1155|  4.44k|    }
 1156|       |
 1157|  46.0k|    const uint8_t *const src_ptr = src + downscaled_x0;
 1158|  46.0k|    uint8_t *const dst_ptr = dst + upscaled_x0;
 1159|  46.0k|    const int dst_width = upscaled_x1 - upscaled_x0;
 1160|       |
 1161|  46.0k|    const int pad_left = (j == 0);
 1162|  46.0k|    const int pad_right = (j == cm->tiles.cols - 1);
 1163|       |
 1164|  46.0k|    bool success;
 1165|  46.0k|#if CONFIG_AV1_HIGHBITDEPTH
 1166|  46.0k|    if (cm->seq_params->use_highbitdepth)
  ------------------
  |  Branch (1166:9): [True: 23.8k, False: 22.2k]
  ------------------
 1167|  23.8k|      success = highbd_upscale_normative_rect(
 1168|  23.8k|          src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width,
 1169|  23.8k|          dst_stride, x_step_qn, x0_qn, pad_left, pad_right,
 1170|  23.8k|          cm->seq_params->bit_depth);
 1171|  22.2k|    else
 1172|  22.2k|      success = upscale_normative_rect(src_ptr, rows, src_width, src_stride,
 1173|  22.2k|                                       dst_ptr, rows, dst_width, dst_stride,
 1174|  22.2k|                                       x_step_qn, x0_qn, pad_left, pad_right);
 1175|       |#else
 1176|       |    success = upscale_normative_rect(src_ptr, rows, src_width, src_stride,
 1177|       |                                     dst_ptr, rows, dst_width, dst_stride,
 1178|       |                                     x_step_qn, x0_qn, pad_left, pad_right);
 1179|       |#endif
 1180|  46.0k|    if (!success) {
  ------------------
  |  Branch (1180:9): [True: 0, False: 46.0k]
  ------------------
 1181|      0|      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1182|      0|                         "Error upscaling frame");
 1183|      0|    }
 1184|       |    // Update the fractional pixel offset to prepare for the next tile column.
 1185|  46.0k|    x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
  ------------------
  |  |   36|  46.0k|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
 1186|  46.0k|  }
 1187|  41.5k|}
av1_calculate_scaled_superres_size:
 1297|  37.8k|                                        int superres_denom) {
 1298|  37.8k|  (void)height;
 1299|  37.8k|  calculate_scaled_size_helper(width, superres_denom);
 1300|  37.8k|}
av1_superres_upscale:
 1318|  9.88k|                          bool alloc_pyramid) {
 1319|  9.88k|  const int num_planes = av1_num_planes(cm);
 1320|  9.88k|  if (!av1_superres_scaled(cm)) return;
  ------------------
  |  Branch (1320:7): [True: 0, False: 9.88k]
  ------------------
 1321|  9.88k|  const SequenceHeader *const seq_params = cm->seq_params;
 1322|  9.88k|  const int byte_alignment = cm->features.byte_alignment;
 1323|       |
 1324|  9.88k|  YV12_BUFFER_CONFIG copy_buffer;
 1325|  9.88k|  memset(&copy_buffer, 0, sizeof(copy_buffer));
 1326|       |
 1327|  9.88k|  YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf;
 1328|       |
 1329|  9.88k|  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
  ------------------
  |  |   69|  9.88k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1330|  9.88k|  if (aom_alloc_frame_buffer(
  ------------------
  |  Branch (1330:7): [True: 0, False: 9.88k]
  ------------------
 1331|  9.88k|          &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
 1332|  9.88k|          seq_params->subsampling_y, seq_params->use_highbitdepth,
 1333|  9.88k|          AOM_BORDER_IN_PIXELS, byte_alignment, false, 0))
  ------------------
  |  |   32|  9.88k|#define AOM_BORDER_IN_PIXELS 288
  ------------------
 1334|      0|    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1335|      0|                       "Failed to allocate copy buffer for superres upscaling");
 1336|       |
 1337|       |  // Copy function assumes the frames are the same size.
 1338|       |  // Note that it does not copy YV12_BUFFER_CONFIG config data.
 1339|  9.88k|  aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes);
  ------------------
  |  |   37|  9.88k|#define aom_yv12_copy_frame aom_yv12_copy_frame_c
  ------------------
 1340|       |
 1341|  9.88k|  assert(copy_buffer.y_crop_width == aligned_width);
 1342|  9.88k|  assert(copy_buffer.y_crop_height == cm->height);
 1343|       |
 1344|       |  // Realloc the current frame buffer at a higher resolution in place.
 1345|  9.88k|  if (pool != NULL) {
  ------------------
  |  Branch (1345:7): [True: 9.88k, False: 0]
  ------------------
 1346|       |    // Use callbacks if on the decoder.
 1347|  9.88k|    aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer;
 1348|  9.88k|    aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb;
 1349|  9.88k|    aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
 1350|  9.88k|    void *cb_priv = pool->cb_priv;
 1351|       |
 1352|  9.88k|    lock_buffer_pool(pool);
 1353|       |    // Realloc with callback does not release the frame buffer - release first.
 1354|  9.88k|    if (release_fb_cb(cb_priv, fb)) {
  ------------------
  |  Branch (1354:9): [True: 0, False: 9.88k]
  ------------------
 1355|      0|      unlock_buffer_pool(pool);
 1356|      0|      aom_internal_error(
 1357|      0|          cm->error, AOM_CODEC_MEM_ERROR,
 1358|      0|          "Failed to free current frame buffer before superres upscaling");
 1359|      0|    }
 1360|       |    // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
 1361|  9.88k|    if (aom_realloc_frame_buffer(
  ------------------
  |  Branch (1361:9): [True: 0, False: 9.88k]
  ------------------
 1362|  9.88k|            frame_to_show, cm->superres_upscaled_width,
 1363|  9.88k|            cm->superres_upscaled_height, seq_params->subsampling_x,
 1364|  9.88k|            seq_params->subsampling_y, seq_params->use_highbitdepth,
 1365|  9.88k|            AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv,
  ------------------
  |  |   32|  9.88k|#define AOM_BORDER_IN_PIXELS 288
  ------------------
 1366|  9.88k|            alloc_pyramid, 0)) {
 1367|      0|      unlock_buffer_pool(pool);
 1368|      0|      aom_internal_error(
 1369|      0|          cm->error, AOM_CODEC_MEM_ERROR,
 1370|      0|          "Failed to allocate current frame buffer for superres upscaling");
 1371|      0|    }
 1372|  9.88k|    unlock_buffer_pool(pool);
 1373|  9.88k|  } else {
 1374|       |    // Make a copy of the config data for frame_to_show in copy_buffer
 1375|      0|    copy_buffer_config(frame_to_show, &copy_buffer);
 1376|       |
 1377|       |    // Don't use callbacks on the encoder.
 1378|       |    // aom_alloc_frame_buffer() clears the config data for frame_to_show
 1379|      0|    if (aom_alloc_frame_buffer(
  ------------------
  |  Branch (1379:9): [True: 0, False: 0]
  ------------------
 1380|      0|            frame_to_show, cm->superres_upscaled_width,
 1381|      0|            cm->superres_upscaled_height, seq_params->subsampling_x,
 1382|      0|            seq_params->subsampling_y, seq_params->use_highbitdepth,
 1383|      0|            AOM_BORDER_IN_PIXELS, byte_alignment, alloc_pyramid, 0))
  ------------------
  |  |   32|      0|#define AOM_BORDER_IN_PIXELS 288
  ------------------
 1384|      0|      aom_internal_error(
 1385|      0|          cm->error, AOM_CODEC_MEM_ERROR,
 1386|      0|          "Failed to reallocate current frame buffer for superres upscaling");
 1387|       |
 1388|       |    // Restore config data back to frame_to_show
 1389|      0|    copy_buffer_config(&copy_buffer, frame_to_show);
 1390|      0|  }
 1391|       |  // TODO(afergs): verify frame_to_show is correct after realloc
 1392|       |  //               encoder:
 1393|       |  //               decoder:
 1394|       |
 1395|  9.88k|  assert(frame_to_show->y_crop_width == cm->superres_upscaled_width);
 1396|  9.88k|  assert(frame_to_show->y_crop_height == cm->superres_upscaled_height);
 1397|       |
 1398|       |  // Scale up and back into frame_to_show.
 1399|  9.88k|  assert(frame_to_show->y_crop_width != cm->width);
 1400|  9.88k|  upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show);
 1401|       |
 1402|       |  // Free the copy buffer
 1403|  9.88k|  aom_free_frame_buffer(&copy_buffer);
 1404|  9.88k|}
resize.c:get_upscale_convolve_x0:
  329|  41.5k|                                       int32_t x_step_qn) {
  330|  41.5k|  const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS);
  ------------------
  |  |   36|  41.5k|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  331|  41.5k|  const int32_t x0 =
  332|  41.5k|      (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
  ------------------
  |  |   36|  41.5k|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  333|  41.5k|       out_length / 2) /
  334|  41.5k|          out_length +
  335|  41.5k|      RS_SCALE_EXTRA_OFF - err / 2;
  ------------------
  |  |   39|  41.5k|#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
  |  |  ------------------
  |  |  |  |   38|  41.5k|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   36|  41.5k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  |  |  ------------------
  |  |  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   34|  41.5k|#define RS_SUBPEL_BITS 6
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  336|  41.5k|  return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
  ------------------
  |  |   37|  41.5k|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  41.5k|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
  337|  41.5k|}
resize.c:highbd_upscale_normative_rect:
  969|  23.8k|                                          int pad_left, int pad_right, int bd) {
  970|  23.8k|  assert(width > 0);
  971|  23.8k|  assert(height > 0);
  972|  23.8k|  assert(width2 > 0);
  973|  23.8k|  assert(height2 > 0);
  974|  23.8k|  assert(height2 == height);
  975|       |
  976|       |  // Extend the left/right pixels of the tile column if needed
  977|       |  // (either because we can't sample from other tiles, or because we're at
  978|       |  // a frame edge).
  979|       |  // Save the overwritten pixels into tmp_left and tmp_right.
  980|       |  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
  981|       |  // column of border pixels compared to what we'd naively think.
  982|  23.8k|  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
  ------------------
  |  |  101|  23.8k|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
  983|  23.8k|  const int border_size = border_cols * sizeof(uint16_t);
  984|  23.8k|  uint16_t *tmp_left =
  985|  23.8k|      NULL;  // Silence spurious "may be used uninitialized" warnings
  986|  23.8k|  uint16_t *tmp_right = NULL;
  987|  23.8k|  uint16_t *const input16 = CONVERT_TO_SHORTPTR(input);
  ------------------
  |  |   75|  23.8k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  988|  23.8k|  uint16_t *const in_tl = input16 - border_cols;
  989|  23.8k|  uint16_t *const in_tr = input16 + width;
  990|  23.8k|  if (pad_left) {
  ------------------
  |  Branch (990:7): [True: 22.3k, False: 1.44k]
  ------------------
  991|  22.3k|    tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
  992|  22.3k|    if (!tmp_left) return false;
  ------------------
  |  Branch (992:9): [True: 0, False: 22.3k]
  ------------------
  993|  1.53M|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (993:21): [True: 1.50M, False: 22.3k]
  ------------------
  994|  1.50M|      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size);
  995|  1.50M|      aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols);
  996|  1.50M|    }
  997|  22.3k|  }
  998|  23.8k|  if (pad_right) {
  ------------------
  |  Branch (998:7): [True: 22.3k, False: 1.44k]
  ------------------
  999|  22.3k|    tmp_right =
 1000|  22.3k|        (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
 1001|  22.3k|    if (!tmp_right) {
  ------------------
  |  Branch (1001:9): [True: 0, False: 22.3k]
  ------------------
 1002|      0|      aom_free(tmp_left);
 1003|      0|      return false;
 1004|      0|    }
 1005|  1.53M|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (1005:21): [True: 1.50M, False: 22.3k]
  ------------------
 1006|  1.50M|      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size);
 1007|  1.50M|      aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1],
 1008|  1.50M|                   border_cols);
 1009|  1.50M|    }
 1010|  22.3k|  }
 1011|       |
 1012|  23.8k|  av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride,
  ------------------
  |  |   75|  23.8k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1013|  23.8k|                               CONVERT_TO_SHORTPTR(output), out_stride, width2,
  ------------------
  |  |   75|  23.8k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 1014|  23.8k|                               height2, &av1_resize_filter_normative[0][0],
 1015|  23.8k|                               x0_qn, x_step_qn, bd);
 1016|       |
 1017|       |  // Restore the left/right border pixels
 1018|  23.8k|  if (pad_left) {
  ------------------
  |  Branch (1018:7): [True: 22.3k, False: 1.44k]
  ------------------
 1019|  1.53M|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (1019:21): [True: 1.50M, False: 22.3k]
  ------------------
 1020|  1.50M|      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size);
 1021|  1.50M|    }
 1022|  22.3k|    aom_free(tmp_left);
 1023|  22.3k|  }
 1024|  23.8k|  if (pad_right) {
  ------------------
  |  Branch (1024:7): [True: 22.3k, False: 1.44k]
  ------------------
 1025|  1.53M|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (1025:21): [True: 1.50M, False: 22.3k]
  ------------------
 1026|  1.50M|      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size);
 1027|  1.50M|    }
 1028|  22.3k|    aom_free(tmp_right);
 1029|  22.3k|  }
 1030|  23.8k|  return true;
 1031|  23.8k|}
resize.c:upscale_normative_rect:
  617|  22.2k|                                   int pad_right) {
  618|  22.2k|  assert(width > 0);
  619|  22.2k|  assert(height > 0);
  620|  22.2k|  assert(width2 > 0);
  621|  22.2k|  assert(height2 > 0);
  622|  22.2k|  assert(height2 == height);
  623|       |
  624|       |  // Extend the left/right pixels of the tile column if needed
  625|       |  // (either because we can't sample from other tiles, or because we're at
  626|       |  // a frame edge).
  627|       |  // Save the overwritten pixels into tmp_left and tmp_right.
  628|       |  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
  629|       |  // column of border pixels compared to what we'd naively think.
  630|  22.2k|  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
  ------------------
  |  |  101|  22.2k|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
  631|  22.2k|  uint8_t *tmp_left =
  632|  22.2k|      NULL;  // Silence spurious "may be used uninitialized" warnings
  633|  22.2k|  uint8_t *tmp_right = NULL;
  634|  22.2k|  uint8_t *const in_tl = (uint8_t *)(input - border_cols);  // Cast off 'const'
  635|  22.2k|  uint8_t *const in_tr = (uint8_t *)(input + width);
  636|  22.2k|  if (pad_left) {
  ------------------
  |  Branch (636:7): [True: 19.2k, False: 2.99k]
  ------------------
  637|  19.2k|    tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
  638|  19.2k|    if (!tmp_left) return false;
  ------------------
  |  Branch (638:9): [True: 0, False: 19.2k]
  ------------------
  639|   601k|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (639:21): [True: 582k, False: 19.2k]
  ------------------
  640|   582k|      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols);
  641|   582k|      memset(in_tl + i * in_stride, input[i * in_stride], border_cols);
  642|   582k|    }
  643|  19.2k|  }
  644|  22.2k|  if (pad_right) {
  ------------------
  |  Branch (644:7): [True: 19.2k, False: 2.99k]
  ------------------
  645|  19.2k|    tmp_right =
  646|  19.2k|        (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
  647|  19.2k|    if (!tmp_right) {
  ------------------
  |  Branch (647:9): [True: 0, False: 19.2k]
  ------------------
  648|      0|      aom_free(tmp_left);
  649|      0|      return false;
  650|      0|    }
  651|   601k|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (651:21): [True: 582k, False: 19.2k]
  ------------------
  652|   582k|      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols);
  653|   582k|      memset(in_tr + i * in_stride, input[i * in_stride + width - 1],
  654|   582k|             border_cols);
  655|   582k|    }
  656|  19.2k|  }
  657|       |
  658|  22.2k|  av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2,
  659|  22.2k|                        height2, &av1_resize_filter_normative[0][0], x0_qn,
  660|  22.2k|                        x_step_qn);
  661|       |
  662|       |  // Restore the left/right border pixels
  663|  22.2k|  if (pad_left) {
  ------------------
  |  Branch (663:7): [True: 19.2k, False: 2.99k]
  ------------------
  664|   601k|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (664:21): [True: 582k, False: 19.2k]
  ------------------
  665|   582k|      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols);
  666|   582k|    }
  667|  19.2k|    aom_free(tmp_left);
  668|  19.2k|  }
  669|  22.2k|  if (pad_right) {
  ------------------
  |  Branch (669:7): [True: 19.2k, False: 2.99k]
  ------------------
  670|   601k|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (670:21): [True: 582k, False: 19.2k]
  ------------------
  671|   582k|      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols);
  672|   582k|    }
  673|  19.2k|    aom_free(tmp_right);
  674|  19.2k|  }
  675|  22.2k|  return true;
  676|  22.2k|}
resize.c:calculate_scaled_size_helper:
 1273|  37.8k|static void calculate_scaled_size_helper(int *dim, int denom) {
 1274|  37.8k|  if (denom != SCALE_NUMERATOR) {
  ------------------
  |  |   22|  37.8k|#define SCALE_NUMERATOR 8
  ------------------
  |  Branch (1274:7): [True: 37.8k, False: 0]
  ------------------
 1275|       |    // We need to ensure the constraint in "Appendix A" of the spec:
 1276|       |    // * FrameWidth is greater than or equal to 16
 1277|       |    // * FrameHeight is greater than or equal to 16
 1278|       |    // For this, we clamp the downscaled dimension to at least 16. One
 1279|       |    // exception: if original dimension itself was < 16, then we keep the
 1280|       |    // downscaled dimension to be same as the original, to ensure that resizing
 1281|       |    // is valid.
 1282|  37.8k|    const int min_dim = AOMMIN(16, *dim);
  ------------------
  |  |   34|  37.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 22.5k, False: 15.3k]
  |  |  ------------------
  ------------------
 1283|       |    // Use this version if we need *dim to be even
 1284|       |    // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom);
 1285|       |    // *width <<= 1;
 1286|  37.8k|    *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom);
  ------------------
  |  |   22|  37.8k|#define SCALE_NUMERATOR 8
  ------------------
 1287|  37.8k|    *dim = AOMMAX(*dim, min_dim);
  ------------------
  |  |   35|  37.8k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 21.2k, False: 16.6k]
  |  |  ------------------
  ------------------
 1288|  37.8k|  }
 1289|  37.8k|}
resize.c:upscale_normative_and_extend_frame:
 1191|  9.88k|                                               YV12_BUFFER_CONFIG *dst) {
 1192|  9.88k|  const int num_planes = av1_num_planes(cm);
 1193|  27.3k|  for (int i = 0; i < num_planes; ++i) {
  ------------------
  |  Branch (1193:19): [True: 17.4k, False: 9.88k]
  ------------------
 1194|  17.4k|    const int is_uv = (i > 0);
 1195|  17.4k|    av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv],
 1196|  17.4k|                               dst->buffers[i], dst->strides[is_uv], i,
 1197|  17.4k|                               src->crop_heights[is_uv]);
 1198|  17.4k|  }
 1199|       |
 1200|  9.88k|  aom_extend_frame_borders(dst, num_planes);
  ------------------
  |  |   31|  9.88k|#define aom_extend_frame_borders aom_extend_frame_borders_c
  ------------------
 1201|  9.88k|}

decodeframe.c:av1_superres_scaled:
   66|   239k|static inline int av1_superres_scaled(const AV1_COMMON *cm) {
   67|       |  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
   68|       |  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
   69|       |  // So, the following check is more accurate.
   70|   239k|  return (cm->width != cm->superres_upscaled_width);
   71|   239k|}
resize.c:av1_superres_scaled:
   66|  9.88k|static inline int av1_superres_scaled(const AV1_COMMON *cm) {
   67|       |  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
   68|       |  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
   69|       |  // So, the following check is more accurate.
   70|  9.88k|  return (cm->width != cm->superres_upscaled_width);
   71|  9.88k|}
restoration.c:av1_superres_scaled:
   66|  2.11M|static inline int av1_superres_scaled(const AV1_COMMON *cm) {
   67|       |  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
   68|       |  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
   69|       |  // So, the following check is more accurate.
   70|  2.11M|  return (cm->width != cm->superres_upscaled_width);
   71|  2.11M|}
tile_common.c:av1_superres_scaled:
   66|  19.8k|static inline int av1_superres_scaled(const AV1_COMMON *cm) {
   67|       |  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
   68|       |  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
   69|       |  // So, the following check is more accurate.
   70|  19.8k|  return (cm->width != cm->superres_upscaled_width);
   71|  19.8k|}

av1_get_upsampled_plane_size:
   48|   241k|                                  int *plane_h) {
   49|   241k|  int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (49:14): [True: 147k, False: 94.3k]
  |  Branch (49:23): [True: 134k, False: 12.9k]
  ------------------
   50|   241k|  int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (50:14): [True: 147k, False: 94.3k]
  |  Branch (50:23): [True: 113k, False: 33.9k]
  ------------------
   51|   241k|  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
  ------------------
  |  |   41|   241k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   52|   241k|  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
  ------------------
  |  |   41|   241k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   53|   241k|}
av1_lr_count_units:
   63|   233k|int av1_lr_count_units(int unit_size, int plane_size) {
   64|   233k|  return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1);
  ------------------
  |  |   35|   233k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 75.8k, False: 157k]
  |  |  ------------------
  ------------------
   65|   233k|}
av1_alloc_restoration_struct:
   68|  81.7k|                                  int is_uv) {
   69|  81.7k|  int plane_w, plane_h;
   70|  81.7k|  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
   71|       |
   72|  81.7k|  const int unit_size = rsi->restoration_unit_size;
   73|  81.7k|  const int horz_units = av1_lr_count_units(unit_size, plane_w);
   74|  81.7k|  const int vert_units = av1_lr_count_units(unit_size, plane_h);
   75|       |
   76|  81.7k|  rsi->num_rest_units = horz_units * vert_units;
   77|  81.7k|  rsi->horz_units = horz_units;
   78|  81.7k|  rsi->vert_units = vert_units;
   79|       |
   80|  81.7k|  aom_free(rsi->unit_info);
   81|  81.7k|  CHECK_MEM_ERROR(cm, rsi->unit_info,
  ------------------
  |  |   51|  81.7k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  81.7k|  do {                                                    \
  |  |  |  |   69|  81.7k|    lval = (expr);                                        \
  |  |  |  |   70|  81.7k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 81.7k]
  |  |  |  |  ------------------
  |  |  |  |   71|  81.7k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  81.7k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   82|  81.7k|                  (RestorationUnitInfo *)aom_memalign(
   83|  81.7k|                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
   84|  81.7k|}
av1_free_restoration_struct:
   86|  48.3k|void av1_free_restoration_struct(RestorationInfo *rst_info) {
   87|  48.3k|  aom_free(rst_info->unit_info);
   88|  48.3k|  rst_info->unit_info = NULL;
   89|  48.3k|}
av1_loop_restoration_precal:
  116|  16.1k|void av1_loop_restoration_precal(void) {
  117|       |#if 0
  118|       |  GenSgrprojVtable();
  119|       |#endif
  120|  16.1k|}
av1_extend_frame:
  173|  46.2k|                      int border_horz, int border_vert, int highbd) {
  174|  46.2k|#if CONFIG_AV1_HIGHBITDEPTH
  175|  46.2k|  if (highbd) {
  ------------------
  |  Branch (175:7): [True: 25.3k, False: 20.9k]
  ------------------
  176|  25.3k|    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
  ------------------
  |  |   75|  25.3k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  177|  25.3k|                        border_horz, border_vert);
  178|  25.3k|    return;
  179|  25.3k|  }
  180|  20.9k|#endif
  181|  20.9k|  (void)highbd;
  182|  20.9k|  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
  183|  20.9k|}
av1_decode_xq:
  584|   551k|void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
  585|   551k|  if (params->r[0] == 0) {
  ------------------
  |  Branch (585:7): [True: 80.2k, False: 471k]
  ------------------
  586|  80.2k|    xq[0] = 0;
  587|  80.2k|    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
  ------------------
  |  |   99|  80.2k|#define SGRPROJ_PRJ_BITS 7
  ------------------
  588|   471k|  } else if (params->r[1] == 0) {
  ------------------
  |  Branch (588:14): [True: 31.0k, False: 440k]
  ------------------
  589|  31.0k|    xq[0] = xqd[0];
  590|  31.0k|    xq[1] = 0;
  591|   440k|  } else {
  592|   440k|    xq[0] = xqd[0];
  593|   440k|    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
  ------------------
  |  |   99|   440k|#define SGRPROJ_PRJ_BITS 7
  ------------------
  594|   440k|  }
  595|   551k|}
av1_loop_restoration_filter_unit:
  992|   121k|    int optimized_lr, struct aom_internal_error_info *error_info) {
  993|   121k|  RestorationType unit_rtype = rui->restoration_type;
  994|       |
  995|   121k|  int unit_h = limits->v_end - limits->v_start;
  996|   121k|  int unit_w = limits->h_end - limits->h_start;
  997|   121k|  uint8_t *data8_tl =
  998|   121k|      data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
  999|   121k|  uint8_t *dst8_tl =
 1000|   121k|      dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
 1001|       |
 1002|   121k|  if (unit_rtype == RESTORE_NONE) {
  ------------------
  |  Branch (1002:7): [True: 43.6k, False: 77.4k]
  ------------------
 1003|  43.6k|    copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
 1004|  43.6k|                   highbd);
 1005|  43.6k|    return;
 1006|  43.6k|  }
 1007|       |
 1008|  77.4k|  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
 1009|  77.4k|  assert(filter_idx < NUM_STRIPE_FILTERS);
 1010|  77.5k|  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
 1011|       |
 1012|  77.5k|  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
  ------------------
  |  |   34|  77.5k|#define RESTORATION_PROC_UNIT_SIZE 64
  ------------------
 1013|       |
 1014|       |  // Filter the whole image one stripe at a time
 1015|  77.5k|  RestorationTileLimits remaining_stripes = *limits;
 1016|  77.5k|  int i = 0;
 1017|   349k|  while (i < unit_h) {
  ------------------
  |  Branch (1017:10): [True: 271k, False: 77.5k]
  ------------------
 1018|   271k|    int copy_above, copy_below;
 1019|   271k|    remaining_stripes.v_start = limits->v_start + i;
 1020|       |
 1021|   271k|    get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
 1022|   271k|                             &copy_above, &copy_below);
 1023|       |
 1024|   271k|    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  ------------------
  |  |   34|   271k|#define RESTORATION_PROC_UNIT_SIZE 64
  ------------------
 1025|   271k|    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|   271k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
 1026|       |
 1027|       |    // Work out where this stripe's boundaries are within
 1028|       |    // rsb->stripe_boundary_{above,below}
 1029|   271k|    const int frame_stripe =
 1030|   271k|        (remaining_stripes.v_start + runit_offset) / full_stripe_height;
 1031|   271k|    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
  ------------------
  |  |   66|   271k|#define RESTORATION_CTX_VERT 2
  ------------------
 1032|       |
 1033|       |    // Calculate this stripe's height, based on two rules:
 1034|       |    // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
 1035|       |    // * We can't extend past the end of the current restoration unit
 1036|   271k|    const int nominal_stripe_height =
 1037|   271k|        full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
  ------------------
  |  Branch (1037:31): [True: 37.3k, False: 234k]
  ------------------
 1038|   271k|    const int h = AOMMIN(nominal_stripe_height,
  ------------------
  |  |   34|   271k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 194k, False: 77.5k]
  |  |  ------------------
  ------------------
 1039|   271k|                         remaining_stripes.v_end - remaining_stripes.v_start);
 1040|       |
 1041|   271k|    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
 1042|   271k|                                     h, data8, stride, rlbs, copy_above,
 1043|   271k|                                     copy_below, optimized_lr);
 1044|       |
 1045|   271k|    stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
 1046|   271k|                  dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
 1047|   271k|                  error_info);
 1048|       |
 1049|   271k|    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
 1050|   271k|                                       data8, stride, copy_above, copy_below,
 1051|   271k|                                       optimized_lr);
 1052|       |
 1053|   271k|    i += h;
 1054|   271k|  }
 1055|  77.5k|}
av1_loop_restoration_filter_frame_init:
 1074|  22.1k|                                            int num_planes) {
 1075|  22.1k|  const SequenceHeader *const seq_params = cm->seq_params;
 1076|  22.1k|  const int bit_depth = seq_params->bit_depth;
 1077|  22.1k|  const int highbd = seq_params->use_highbitdepth;
 1078|  22.1k|  lr_ctxt->dst = &cm->rst_frame;
 1079|       |
 1080|  22.1k|  const int frame_width = frame->crop_widths[0];
 1081|  22.1k|  const int frame_height = frame->crop_heights[0];
 1082|  22.1k|  if (aom_realloc_frame_buffer(
  ------------------
  |  Branch (1082:7): [True: 0, False: 22.1k]
  ------------------
 1083|  22.1k|          lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
 1084|  22.1k|          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
  ------------------
  |  |   30|  22.1k|#define AOM_RESTORATION_FRAME_BORDER 32
  ------------------
 1085|  22.1k|          cm->features.byte_alignment, NULL, NULL, NULL, false,
 1086|  22.1k|          0) != AOM_CODEC_OK)
 1087|      0|    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1088|      0|                       "Failed to allocate restoration dst buffer");
 1089|       |
 1090|  22.1k|  lr_ctxt->on_rest_unit = filter_frame_on_unit;
 1091|  22.1k|  lr_ctxt->frame = frame;
 1092|  81.6k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1092:23): [True: 59.5k, False: 22.1k]
  ------------------
 1093|  59.5k|    RestorationInfo *rsi = &cm->rst_info[plane];
 1094|  59.5k|    RestorationType rtype = rsi->frame_restoration_type;
 1095|  59.5k|    rsi->optimized_lr = optimized_lr;
 1096|  59.5k|    lr_ctxt->ctxt[plane].rsi = rsi;
 1097|       |
 1098|  59.5k|    if (rtype == RESTORE_NONE) {
  ------------------
  |  Branch (1098:9): [True: 13.2k, False: 46.2k]
  ------------------
 1099|  13.2k|      continue;
 1100|  13.2k|    }
 1101|       |
 1102|  46.2k|    const int is_uv = plane > 0;
 1103|  46.2k|    int plane_w, plane_h;
 1104|  46.2k|    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 1105|  46.2k|    assert(plane_w == frame->crop_widths[is_uv]);
 1106|  46.2k|    assert(plane_h == frame->crop_heights[is_uv]);
 1107|       |
 1108|  46.2k|    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
 1109|  46.2k|                     frame->strides[is_uv], RESTORATION_BORDER,
  ------------------
  |  |   62|  46.2k|#define RESTORATION_BORDER 3
  ------------------
 1110|  46.2k|                     RESTORATION_BORDER, highbd);
  ------------------
  |  |   62|  46.2k|#define RESTORATION_BORDER 3
  ------------------
 1111|       |
 1112|  46.2k|    FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
 1113|  46.2k|    lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
  ------------------
  |  Branch (1113:27): [True: 26.2k, False: 20.0k]
  |  Branch (1113:36): [True: 23.9k, False: 2.23k]
  ------------------
 1114|  46.2k|    lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
  ------------------
  |  Branch (1114:27): [True: 26.2k, False: 20.0k]
  |  Branch (1114:36): [True: 19.6k, False: 6.51k]
  ------------------
 1115|  46.2k|    lr_plane_ctxt->plane_w = plane_w;
 1116|  46.2k|    lr_plane_ctxt->plane_h = plane_h;
 1117|  46.2k|    lr_plane_ctxt->highbd = highbd;
 1118|  46.2k|    lr_plane_ctxt->bit_depth = bit_depth;
 1119|  46.2k|    lr_plane_ctxt->data8 = frame->buffers[plane];
 1120|  46.2k|    lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
 1121|  46.2k|    lr_plane_ctxt->data_stride = frame->strides[is_uv];
 1122|  46.2k|    lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
 1123|  46.2k|  }
 1124|  22.1k|}
av1_loop_restoration_filter_frame:
 1199|  6.40k|                                       void *lr_ctxt) {
 1200|  6.40k|  assert(!cm->features.all_lossless);
 1201|  6.40k|  const int num_planes = av1_num_planes(cm);
 1202|       |
 1203|  6.40k|  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
 1204|       |
 1205|  6.40k|  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
 1206|  6.40k|                                         optimized_lr, num_planes);
 1207|       |
 1208|  6.40k|  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
 1209|       |
 1210|  6.40k|  loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
 1211|  6.40k|}
av1_foreach_rest_unit_in_row:
 1219|  65.9k|    struct aom_internal_error_info *error_info) {
 1220|  65.9k|  const int ext_size = unit_size * 3 / 2;
 1221|  65.9k|  int x0 = 0, j = 0;
 1222|   187k|  while (x0 < plane_w) {
  ------------------
  |  Branch (1222:10): [True: 121k, False: 65.9k]
  ------------------
 1223|   121k|    int remaining_w = plane_w - x0;
 1224|   121k|    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
  ------------------
  |  Branch (1224:13): [True: 65.9k, False: 55.2k]
  ------------------
 1225|       |
 1226|   121k|    limits->h_start = x0;
 1227|   121k|    limits->h_end = x0 + w;
 1228|   121k|    assert(limits->h_end <= plane_w);
 1229|       |
 1230|   121k|    const int unit_idx = row_number * hnum_rest_units + j;
 1231|       |
 1232|       |    // No sync for even numbered rows
 1233|       |    // For odd numbered rows, Loop Restoration of current block requires the LR
 1234|       |    // of top-right and bottom-right blocks to be completed
 1235|       |
 1236|       |    // top-right sync
 1237|   121k|    on_sync_read(lr_sync, row_number, j, plane);
 1238|   121k|    if ((row_number + 1) < vnum_rest_units)
  ------------------
  |  Branch (1238:9): [True: 53.9k, False: 67.2k]
  ------------------
 1239|       |      // bottom-right sync
 1240|  53.9k|      on_sync_read(lr_sync, row_number + 2, j, plane);
 1241|       |
 1242|   121k|#if CONFIG_MULTITHREAD
 1243|   121k|    if (lr_sync && lr_sync->num_workers > 1) {
  ------------------
  |  Branch (1243:9): [True: 102k, False: 18.7k]
  |  Branch (1243:20): [True: 102k, False: 18.4E]
  ------------------
 1244|   102k|      pthread_mutex_lock(lr_sync->job_mutex);
 1245|   102k|      const bool lr_mt_exit = lr_sync->lr_mt_exit;
 1246|   102k|      pthread_mutex_unlock(lr_sync->job_mutex);
 1247|       |      // Exit in case any worker has encountered an error.
 1248|   102k|      if (lr_mt_exit) return;
  ------------------
  |  Branch (1248:11): [True: 0, False: 102k]
  ------------------
 1249|   102k|    }
 1250|   121k|#endif
 1251|       |
 1252|   121k|    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
 1253|       |
 1254|   121k|    on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
 1255|       |
 1256|   121k|    x0 += w;
 1257|   121k|    ++j;
 1258|   121k|  }
 1259|  65.9k|}
av1_lr_sync_read_dummy:
 1261|   127k|void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
 1262|   127k|  (void)lr_sync;
 1263|   127k|  (void)r;
 1264|   127k|  (void)c;
 1265|   127k|  (void)plane;
 1266|   127k|}
av1_lr_sync_write_dummy:
 1269|  53.8k|                             const int sb_cols, int plane) {
 1270|  53.8k|  (void)lr_sync;
 1271|  53.8k|  (void)r;
 1272|  53.8k|  (void)c;
 1273|  53.8k|  (void)sb_cols;
 1274|  53.8k|  (void)plane;
 1275|  53.8k|}
av1_loop_restoration_corners_in_sb:
 1280|  17.3M|                                       int *rrow1) {
 1281|  17.3M|  assert(rcol0 && rcol1 && rrow0 && rrow1);
 1282|       |
 1283|  17.3M|  if (bsize != cm->seq_params->sb_size) return 0;
  ------------------
  |  Branch (1283:7): [True: 16.4M, False: 866k]
  ------------------
 1284|       |
 1285|   866k|  assert(!cm->features.all_lossless);
 1286|       |
 1287|   868k|  const int is_uv = plane > 0;
 1288|       |
 1289|       |  // Compute the mi-unit corners of the superblock
 1290|   868k|  const int mi_row0 = mi_row;
 1291|   868k|  const int mi_col0 = mi_col;
 1292|   868k|  const int mi_row1 = mi_row0 + mi_size_high[bsize];
 1293|   868k|  const int mi_col1 = mi_col0 + mi_size_wide[bsize];
 1294|       |
 1295|   868k|  const RestorationInfo *rsi = &cm->rst_info[plane];
 1296|   868k|  const int size = rsi->restoration_unit_size;
 1297|   868k|  const int horz_units = rsi->horz_units;
 1298|   868k|  const int vert_units = rsi->vert_units;
 1299|       |
 1300|       |  // The size of an MI-unit on this plane of the image
 1301|   868k|  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (1301:20): [True: 551k, False: 316k]
  |  Branch (1301:29): [True: 517k, False: 34.5k]
  ------------------
 1302|   868k|  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (1302:20): [True: 551k, False: 316k]
  |  Branch (1302:29): [True: 510k, False: 41.1k]
  ------------------
 1303|   868k|  const int mi_size_x = MI_SIZE >> ss_x;
  ------------------
  |  |   40|   868k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   868k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1304|   868k|  const int mi_size_y = MI_SIZE >> ss_y;
  ------------------
  |  |   40|   868k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   868k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1305|       |
 1306|       |  // Write m for the relative mi column or row, D for the superres denominator
 1307|       |  // and N for the superres numerator. If u is the upscaled pixel offset then
 1308|       |  // we can write the downscaled pixel offset in two ways as:
 1309|       |  //
 1310|       |  //   MI_SIZE * m = N / D u
 1311|       |  //
 1312|       |  // from which we get u = D * MI_SIZE * m / N
 1313|   868k|  const int mi_to_num_x = av1_superres_scaled(cm)
  ------------------
  |  Branch (1313:27): [True: 29.7k, False: 838k]
  ------------------
 1314|   868k|                              ? mi_size_x * cm->superres_scale_denominator
 1315|   868k|                              : mi_size_x;
 1316|   868k|  const int mi_to_num_y = mi_size_y;
 1317|   868k|  const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size;
  ------------------
  |  |   22|  29.7k|#define SCALE_NUMERATOR 8
  ------------------
  |  Branch (1317:23): [True: 29.7k, False: 838k]
  ------------------
 1318|   868k|  const int denom_y = size;
 1319|       |
 1320|   868k|  const int rnd_x = denom_x - 1;
 1321|   868k|  const int rnd_y = denom_y - 1;
 1322|       |
 1323|       |  // rcol0/rrow0 should be the first column/row of restoration units that
 1324|       |  // doesn't start left/below of mi_col/mi_row. For this calculation, we need
 1325|       |  // to round up the division (if the sb starts at runit column 10.1, the first
 1326|       |  // matching runit has column index 11)
 1327|   868k|  *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x;
 1328|   868k|  *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y;
 1329|       |
 1330|       |  // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
 1331|       |  // below-right. If we're at the bottom or right of the frame, this restoration
 1332|       |  // unit might not exist, in which case we'll clamp accordingly.
 1333|   868k|  *rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
  ------------------
  |  |   34|   868k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 632k, False: 236k]
  |  |  ------------------
  ------------------
 1334|   868k|  *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
  ------------------
  |  |   34|   868k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 533k, False: 334k]
  |  |  ------------------
  ------------------
 1335|       |
 1336|   868k|  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
  ------------------
  |  Branch (1336:10): [True: 409k, False: 458k]
  |  Branch (1336:29): [True: 252k, False: 157k]
  ------------------
 1337|   866k|}
av1_loop_restoration_save_boundary_lines:
 1498|  38.8k|                                              AV1_COMMON *cm, int after_cdef) {
 1499|  38.8k|  const int num_planes = av1_num_planes(cm);
 1500|  38.8k|  const int use_highbd = cm->seq_params->use_highbitdepth;
 1501|   142k|  for (int p = 0; p < num_planes; ++p) {
  ------------------
  |  Branch (1501:19): [True: 103k, False: 38.8k]
  ------------------
 1502|   103k|    save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
 1503|   103k|  }
 1504|  38.8k|}
restoration.c:extend_frame_highbd:
  145|  25.3k|                                int border_vert) {
  146|  25.3k|  uint16_t *data_p;
  147|  25.3k|  int i, j;
  148|  3.40M|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (148:15): [True: 3.37M, False: 25.3k]
  ------------------
  149|  3.37M|    data_p = data + i * stride;
  150|  13.5M|    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
  ------------------
  |  Branch (150:28): [True: 10.1M, False: 3.37M]
  ------------------
  151|  13.5M|    for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
  ------------------
  |  Branch (151:21): [True: 10.1M, False: 3.37M]
  ------------------
  152|  3.37M|  }
  153|  25.3k|  data_p = data - border_horz;
  154|   101k|  for (i = -border_vert; i < 0; ++i) {
  ------------------
  |  Branch (154:26): [True: 76.0k, False: 25.3k]
  ------------------
  155|  76.0k|    memcpy(data_p + i * stride, data_p,
  156|  76.0k|           (width + 2 * border_horz) * sizeof(uint16_t));
  157|  76.0k|  }
  158|   101k|  for (i = height; i < height + border_vert; ++i) {
  ------------------
  |  Branch (158:20): [True: 76.0k, False: 25.3k]
  ------------------
  159|  76.0k|    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
  160|  76.0k|           (width + 2 * border_horz) * sizeof(uint16_t));
  161|  76.0k|  }
  162|  25.3k|}
restoration.c:extend_frame_lowbd:
  124|  20.9k|                               int border_vert) {
  125|  20.9k|  uint8_t *data_p;
  126|  20.9k|  int i;
  127|  2.39M|  for (i = 0; i < height; ++i) {
  ------------------
  |  Branch (127:15): [True: 2.37M, False: 20.9k]
  ------------------
  128|  2.37M|    data_p = data + i * stride;
  129|  2.37M|    memset(data_p - border_horz, data_p[0], border_horz);
  130|  2.37M|    memset(data_p + width, data_p[width - 1], border_horz);
  131|  2.37M|  }
  132|  20.9k|  data_p = data - border_horz;
  133|  83.7k|  for (i = -border_vert; i < 0; ++i) {
  ------------------
  |  Branch (133:26): [True: 62.8k, False: 20.9k]
  ------------------
  134|  62.8k|    memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
  135|  62.8k|  }
  136|  83.7k|  for (i = height; i < height + border_vert; ++i) {
  ------------------
  |  Branch (136:20): [True: 62.8k, False: 20.9k]
  ------------------
  137|  62.8k|    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
  138|  62.8k|           width + 2 * border_horz);
  139|  62.8k|  }
  140|  20.9k|}
restoration.c:copy_rest_unit:
  193|  43.6k|                           int highbd) {
  194|  43.6k|#if CONFIG_AV1_HIGHBITDEPTH
  195|  43.6k|  if (highbd) {
  ------------------
  |  Branch (195:7): [True: 15.5k, False: 28.1k]
  ------------------
  196|  15.5k|    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
  ------------------
  |  |   75|  15.5k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  197|  15.5k|                          CONVERT_TO_SHORTPTR(dst), dst_stride);
  ------------------
  |  |   75|  15.5k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  198|  15.5k|    return;
  199|  15.5k|  }
  200|  28.1k|#endif
  201|  28.1k|  (void)highbd;
  202|  28.1k|  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
  203|  28.1k|}
restoration.c:copy_rest_unit_highbd:
  166|  15.5k|                                  int dst_stride) {
  167|  1.63M|  for (int i = 0; i < height; ++i)
  ------------------
  |  Branch (167:19): [True: 1.61M, False: 15.5k]
  ------------------
  168|  1.61M|    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
  169|  15.5k|}
restoration.c:copy_rest_unit_lowbd:
  186|  28.1k|                                 int src_stride, uint8_t *dst, int dst_stride) {
  187|  2.14M|  for (int i = 0; i < height; ++i)
  ------------------
  |  Branch (187:19): [True: 2.11M, False: 28.1k]
  ------------------
  188|  2.11M|    memcpy(dst + i * dst_stride, src + i * src_stride, width);
  189|  28.1k|}
restoration.c:wiener_filter_stripe:
  394|  45.1k|                                 struct aom_internal_error_info *error_info) {
  395|  45.1k|  (void)tmpbuf;
  396|  45.1k|  (void)bit_depth;
  397|  45.1k|  (void)error_info;
  398|  45.1k|  assert(bit_depth == 8);
  399|  45.1k|  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
  400|       |
  401|   349k|  for (int j = 0; j < stripe_width; j += procunit_width) {
  ------------------
  |  Branch (401:19): [True: 304k, False: 45.1k]
  ------------------
  402|   304k|    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
  ------------------
  |  |   34|   304k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 259k, False: 44.9k]
  |  |  ------------------
  ------------------
  403|   304k|    const uint8_t *src_p = src + j;
  404|   304k|    uint8_t *dst_p = dst + j;
  405|   304k|    av1_wiener_convolve_add_src(
  406|   304k|        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
  407|   304k|        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
  408|   304k|  }
  409|  45.1k|}
restoration.c:sgrproj_filter_stripe:
  909|  78.1k|                                  struct aom_internal_error_info *error_info) {
  910|  78.1k|  (void)bit_depth;
  911|  78.1k|  assert(bit_depth == 8);
  912|       |
  913|   342k|  for (int j = 0; j < stripe_width; j += procunit_width) {
  ------------------
  |  Branch (913:19): [True: 264k, False: 78.1k]
  ------------------
  914|   264k|    int w = AOMMIN(procunit_width, stripe_width - j);
  ------------------
  |  |   34|   264k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 186k, False: 77.9k]
  |  |  ------------------
  ------------------
  915|   264k|    if (av1_apply_selfguided_restoration(
  ------------------
  |  Branch (915:9): [True: 0, False: 264k]
  ------------------
  916|   264k|            src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
  917|   264k|            rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
  918|   264k|            0) != 0) {
  919|      0|      aom_internal_error(
  920|      0|          error_info, AOM_CODEC_MEM_ERROR,
  921|      0|          "Error allocating buffer in av1_apply_selfguided_restoration");
  922|      0|    }
  923|   264k|  }
  924|  78.1k|}
restoration.c:wiener_filter_stripe_highbd:
  931|  54.6k|    struct aom_internal_error_info *error_info) {
  932|  54.6k|  (void)tmpbuf;
  933|  54.6k|  (void)error_info;
  934|  54.6k|  const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
  935|       |
  936|   420k|  for (int j = 0; j < stripe_width; j += procunit_width) {
  ------------------
  |  Branch (936:19): [True: 366k, False: 54.6k]
  ------------------
  937|   366k|    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
  ------------------
  |  |   34|   366k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 313k, False: 52.8k]
  |  |  ------------------
  ------------------
  938|   366k|    const uint8_t *src8_p = src8 + j;
  939|   366k|    uint8_t *dst8_p = dst8 + j;
  940|   366k|    av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
  941|   366k|                                       rui->wiener_info.hfilter, 16,
  942|   366k|                                       rui->wiener_info.vfilter, 16, w,
  943|   366k|                                       stripe_height, &conv_params, bit_depth);
  944|   366k|  }
  945|  54.6k|}
restoration.c:sgrproj_filter_stripe_highbd:
  951|  93.8k|    struct aom_internal_error_info *error_info) {
  952|   380k|  for (int j = 0; j < stripe_width; j += procunit_width) {
  ------------------
  |  Branch (952:19): [True: 286k, False: 93.8k]
  ------------------
  953|   286k|    int w = AOMMIN(procunit_width, stripe_width - j);
  ------------------
  |  |   34|   286k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 192k, False: 93.6k]
  |  |  ------------------
  ------------------
  954|   286k|    if (av1_apply_selfguided_restoration(
  ------------------
  |  Branch (954:9): [True: 0, False: 286k]
  ------------------
  955|   286k|            src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
  956|   286k|            rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
  957|   286k|            1) != 0) {
  958|      0|      aom_internal_error(
  959|      0|          error_info, AOM_CODEC_MEM_ERROR,
  960|      0|          "Error allocating buffer in av1_apply_selfguided_restoration");
  961|      0|    }
  962|   286k|  }
  963|  93.8k|}
restoration.c:get_stripe_boundary_info:
  221|   271k|                                     int *copy_above, int *copy_below) {
  222|   271k|  (void)plane_w;
  223|       |
  224|   271k|  *copy_above = 1;
  225|   271k|  *copy_below = 1;
  226|       |
  227|   271k|  const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  ------------------
  |  |   34|   271k|#define RESTORATION_PROC_UNIT_SIZE 64
  ------------------
  228|   271k|  const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|   271k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
  229|       |
  230|   271k|  const int first_stripe_in_plane = (limits->v_start == 0);
  231|   271k|  const int this_stripe_height =
  232|   271k|      full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
  ------------------
  |  Branch (232:29): [True: 37.3k, False: 234k]
  ------------------
  233|   271k|  const int last_stripe_in_plane =
  234|   271k|      (limits->v_start + this_stripe_height >= plane_h);
  235|       |
  236|   271k|  if (first_stripe_in_plane) *copy_above = 0;
  ------------------
  |  Branch (236:7): [True: 37.3k, False: 234k]
  ------------------
  237|   271k|  if (last_stripe_in_plane) *copy_below = 0;
  ------------------
  |  Branch (237:7): [True: 38.7k, False: 233k]
  ------------------
  238|   271k|}
restoration.c:setup_processing_stripe_boundary:
  252|   271k|    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
  253|       |  // Offsets within the line buffers. The buffer logically starts at column
  254|       |  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
  255|       |  // has column x0 in the buffer.
  256|   271k|  const int buf_stride = rsb->stripe_boundary_stride;
  257|   271k|  const int buf_x0_off = limits->h_start;
  258|   271k|  const int line_width =
  259|   271k|      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|   271k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  260|   271k|  const int line_size = line_width << use_highbd;
  261|       |
  262|   271k|  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|   271k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  263|       |
  264|       |  // Replace RESTORATION_BORDER pixels above the top of the stripe
  265|       |  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
  266|       |  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
  267|       |  // duplicating the topmost of the 2 lines (see the AOMMAX call when
  268|       |  // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
  269|   271k|  if (!opt) {
  ------------------
  |  Branch (269:7): [True: 248k, False: 23.4k]
  ------------------
  270|   248k|    if (copy_above) {
  ------------------
  |  Branch (270:9): [True: 219k, False: 29.2k]
  ------------------
  271|   219k|      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
  272|       |
  273|   876k|      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
  ------------------
  |  |   62|   219k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (273:41): [True: 656k, False: 219k]
  ------------------
  274|   656k|        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
  ------------------
  |  |   35|   656k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 219k, False: 437k]
  |  |  ------------------
  ------------------
  275|   656k|        const int buf_off = buf_x0_off + buf_row * buf_stride;
  276|   656k|        const uint8_t *buf =
  277|   656k|            rsb->stripe_boundary_above + (buf_off << use_highbd);
  278|   656k|        uint8_t *dst8 = data8_tl + i * data_stride;
  279|       |        // Save old pixels, then replace with data from stripe_boundary_above
  280|   656k|        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
  ------------------
  |  |   62|   656k|#define RESTORATION_BORDER 3
  ------------------
  281|   656k|               REAL_PTR(use_highbd, dst8), line_size);
  ------------------
  |  |  205|   656k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|   337k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 337k, False: 319k]
  |  |  ------------------
  ------------------
  282|   656k|        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
  ------------------
  |  |  205|   656k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|   337k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 337k, False: 319k]
  |  |  ------------------
  ------------------
  283|   656k|      }
  284|   219k|    }
  285|       |
  286|       |    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
  287|       |    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
  288|       |    // for i = 0, 1, 2.
  289|   248k|    if (copy_below) {
  ------------------
  |  Branch (289:9): [True: 217k, False: 30.6k]
  ------------------
  290|   217k|      const int stripe_end = limits->v_start + h;
  291|   217k|      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
  292|       |
  293|   870k|      for (int i = 0; i < RESTORATION_BORDER; ++i) {
  ------------------
  |  |   62|   870k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (293:23): [True: 653k, False: 217k]
  ------------------
  294|   653k|        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
  ------------------
  |  |   34|   653k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 217k, False: 435k]
  |  |  ------------------
  ------------------
  295|   653k|        const int buf_off = buf_x0_off + buf_row * buf_stride;
  296|   653k|        const uint8_t *src =
  297|   653k|            rsb->stripe_boundary_below + (buf_off << use_highbd);
  298|       |
  299|   653k|        uint8_t *dst8 = data8_bl + i * data_stride;
  300|       |        // Save old pixels, then replace with data from stripe_boundary_below
  301|   653k|        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
  ------------------
  |  |  205|   653k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|   335k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 335k, False: 317k]
  |  |  ------------------
  ------------------
  302|   653k|        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
  ------------------
  |  |  205|   653k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|   335k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 335k, False: 317k]
  |  |  ------------------
  ------------------
  303|   653k|      }
  304|   217k|    }
  305|   248k|  } else {
  306|  23.4k|    if (copy_above) {
  ------------------
  |  Branch (306:9): [True: 15.3k, False: 8.13k]
  ------------------
  307|  15.3k|      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
  308|       |
  309|       |      // Only save and overwrite i=-RESTORATION_BORDER line.
  310|  15.3k|      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
  ------------------
  |  |   62|  15.3k|#define RESTORATION_BORDER 3
  ------------------
  311|       |      // Save old pixels, then replace with data from stripe_boundary_above
  312|  15.3k|      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
  ------------------
  |  |  205|  15.3k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  12.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 12.2k, False: 3.04k]
  |  |  ------------------
  ------------------
  313|  15.3k|      memcpy(REAL_PTR(use_highbd, dst8),
  ------------------
  |  |  205|  15.3k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  12.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 12.2k, False: 3.05k]
  |  |  ------------------
  ------------------
  314|  15.3k|             REAL_PTR(use_highbd,
  ------------------
  |  |  205|  15.3k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  12.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 12.2k, False: 3.05k]
  |  |  ------------------
  ------------------
  315|  15.3k|                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
  316|  15.3k|             line_size);
  317|  15.3k|    }
  318|       |
  319|  23.4k|    if (copy_below) {
  ------------------
  |  Branch (319:9): [True: 15.2k, False: 8.19k]
  ------------------
  320|  15.2k|      const int stripe_end = limits->v_start + h;
  321|  15.2k|      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
  322|       |
  323|       |      // Only save and overwrite i=2 line.
  324|  15.2k|      uint8_t *dst8 = data8_bl + 2 * data_stride;
  325|       |      // Save old pixels, then replace with data from stripe_boundary_below
  326|  15.2k|      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
  ------------------
  |  |  205|  15.2k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  12.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 12.2k, False: 3.00k]
  |  |  ------------------
  ------------------
  327|  15.2k|      memcpy(REAL_PTR(use_highbd, dst8),
  ------------------
  |  |  205|  15.2k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  12.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 12.2k, False: 3.00k]
  |  |  ------------------
  ------------------
  328|  15.2k|             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
  ------------------
  |  |  205|  15.2k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  12.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 12.2k, False: 3.00k]
  |  |  ------------------
  ------------------
  329|  15.2k|    }
  330|  23.4k|  }
  331|   271k|}
restoration.c:restore_processing_stripe_boundary:
  339|   271k|    int copy_below, int opt) {
  340|   271k|  const int line_width =
  341|   271k|      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|   271k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  342|   271k|  const int line_size = line_width << use_highbd;
  343|       |
  344|   271k|  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
  ------------------
  |  |   70|   271k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
  345|       |
  346|   271k|  if (!opt) {
  ------------------
  |  Branch (346:7): [True: 248k, False: 23.4k]
  ------------------
  347|   248k|    if (copy_above) {
  ------------------
  |  Branch (347:9): [True: 218k, False: 29.2k]
  ------------------
  348|   218k|      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
  349|   875k|      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
  ------------------
  |  |   62|   218k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (349:41): [True: 656k, False: 218k]
  ------------------
  350|   656k|        uint8_t *dst8 = data8_tl + i * data_stride;
  351|   656k|        memcpy(REAL_PTR(use_highbd, dst8),
  ------------------
  |  |  205|   656k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|   336k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 336k, False: 319k]
  |  |  ------------------
  ------------------
  352|   656k|               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
  ------------------
  |  |   62|   656k|#define RESTORATION_BORDER 3
  ------------------
  353|   656k|      }
  354|   218k|    }
  355|       |
  356|   248k|    if (copy_below) {
  ------------------
  |  Branch (356:9): [True: 217k, False: 30.2k]
  ------------------
  357|   217k|      const int stripe_bottom = limits->v_start + h;
  358|   217k|      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
  359|       |
  360|   871k|      for (int i = 0; i < RESTORATION_BORDER; ++i) {
  ------------------
  |  |   62|   871k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (360:23): [True: 653k, False: 217k]
  ------------------
  361|   653k|        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
  ------------------
  |  |   62|   653k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (361:13): [True: 0, False: 653k]
  ------------------
  362|       |
  363|   653k|        uint8_t *dst8 = data8_bl + i * data_stride;
  364|   653k|        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
  ------------------
  |  |  205|   653k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|   335k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 335k, False: 318k]
  |  |  ------------------
  ------------------
  365|   653k|      }
  366|   217k|    }
  367|   248k|  } else {
  368|  23.4k|    if (copy_above) {
  ------------------
  |  Branch (368:9): [True: 15.2k, False: 8.12k]
  ------------------
  369|  15.2k|      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
  370|       |
  371|       |      // Only restore i=-RESTORATION_BORDER line.
  372|  15.2k|      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
  ------------------
  |  |   62|  15.2k|#define RESTORATION_BORDER 3
  ------------------
  373|  15.2k|      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
  ------------------
  |  |  205|  15.2k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  12.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 12.2k, False: 3.04k]
  |  |  ------------------
  ------------------
  374|  15.2k|    }
  375|       |
  376|  23.4k|    if (copy_below) {
  ------------------
  |  Branch (376:9): [True: 15.2k, False: 8.18k]
  ------------------
  377|  15.2k|      const int stripe_bottom = limits->v_start + h;
  378|  15.2k|      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
  379|       |
  380|       |      // Only restore i=2 line.
  381|  15.2k|      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
  ------------------
  |  |   62|  15.2k|#define RESTORATION_BORDER 3
  ------------------
  |  Branch (381:11): [True: 15.2k, False: 0]
  ------------------
  382|  15.2k|        uint8_t *dst8 = data8_bl + 2 * data_stride;
  383|  15.2k|        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
  ------------------
  |  |  205|  15.2k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  12.2k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 12.2k, False: 3.00k]
  |  |  ------------------
  ------------------
  384|  15.2k|      }
  385|  15.2k|    }
  386|  23.4k|  }
  387|   271k|}
restoration.c:filter_frame_on_unit:
 1060|   121k|                                 struct aom_internal_error_info *error_info) {
 1061|   121k|  FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
 1062|   121k|  const RestorationInfo *rsi = ctxt->rsi;
 1063|       |
 1064|   121k|  av1_loop_restoration_filter_unit(
 1065|   121k|      limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs,
 1066|   121k|      ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
 1067|   121k|      ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
 1068|   121k|      ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info);
 1069|   121k|}
restoration.c:foreach_rest_unit_in_planes:
 1184|  6.40k|                                        int num_planes) {
 1185|  6.40k|  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
 1186|       |
 1187|  20.7k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1187:23): [True: 14.3k, False: 6.40k]
  ------------------
 1188|  14.3k|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
  ------------------
  |  Branch (1188:9): [True: 4.15k, False: 10.1k]
  ------------------
 1189|  4.15k|      continue;
 1190|  4.15k|    }
 1191|       |
 1192|  10.1k|    foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, &ctxt[plane],
 1193|  10.1k|                               cm->rst_tmpbuf, cm->rlbs);
 1194|  10.1k|  }
 1195|  6.40k|}
restoration.c:foreach_rest_unit_in_plane:
 1147|  10.1k|                                       RestorationLineBuffers *rlbs) {
 1148|  10.1k|  const RestorationInfo *rsi = &cm->rst_info[plane];
 1149|  10.1k|  const int hnum_rest_units = rsi->horz_units;
 1150|  10.1k|  const int vnum_rest_units = rsi->vert_units;
 1151|  10.1k|  const int unit_size = rsi->restoration_unit_size;
 1152|       |
 1153|  10.1k|  const int is_uv = plane > 0;
 1154|  10.1k|  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (1154:20): [True: 5.02k, False: 5.13k]
  |  Branch (1154:29): [True: 3.76k, False: 1.25k]
  ------------------
 1155|  10.1k|  const int ext_size = unit_size * 3 / 2;
 1156|  10.1k|  int plane_w, plane_h;
 1157|  10.1k|  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 1158|       |
 1159|  10.1k|  int y0 = 0, i = 0;
 1160|  23.3k|  while (y0 < plane_h) {
  ------------------
  |  Branch (1160:10): [True: 13.1k, False: 10.1k]
  ------------------
 1161|  13.1k|    int remaining_h = plane_h - y0;
 1162|  13.1k|    int h = (remaining_h < ext_size) ? remaining_h : unit_size;
  ------------------
  |  Branch (1162:13): [True: 10.1k, False: 3.02k]
  ------------------
 1163|       |
 1164|  13.1k|    RestorationTileLimits limits;
 1165|  13.1k|    limits.v_start = y0;
 1166|  13.1k|    limits.v_end = y0 + h;
 1167|  13.1k|    assert(limits.v_end <= plane_h);
 1168|       |    // Offset upwards to align with the restoration processing stripe
 1169|  13.1k|    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|  13.1k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
 1170|  13.1k|    limits.v_start = AOMMAX(0, limits.v_start - voffset);
  ------------------
  |  |   35|  13.1k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 10.1k, False: 3.02k]
  |  |  ------------------
  ------------------
 1171|  13.1k|    if (limits.v_end < plane_h) limits.v_end -= voffset;
  ------------------
  |  Branch (1171:9): [True: 3.02k, False: 10.1k]
  ------------------
 1172|       |
 1173|  13.1k|    av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
 1174|  13.1k|                                 hnum_rest_units, vnum_rest_units, plane, priv,
 1175|  13.1k|                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
 1176|  13.1k|                                 av1_lr_sync_write_dummy, NULL, cm->error);
 1177|       |
 1178|  13.1k|    y0 += h;
 1179|  13.1k|    ++i;
 1180|  13.1k|  }
 1181|  10.1k|}
restoration.c:loop_restoration_copy_planes:
 1127|  6.40k|                                         AV1_COMMON *cm, int num_planes) {
 1128|  6.40k|  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
 1129|  6.40k|                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
 1130|  6.40k|                           int vstart, int vend);
 1131|  6.40k|  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
  ------------------
  |  |   58|  6.40k|#define aom_yv12_partial_coloc_copy_y aom_yv12_partial_coloc_copy_y_c
  ------------------
 1132|  6.40k|                                         aom_yv12_partial_coloc_copy_u,
  ------------------
  |  |   52|  6.40k|#define aom_yv12_partial_coloc_copy_u aom_yv12_partial_coloc_copy_u_c
  ------------------
 1133|  6.40k|                                         aom_yv12_partial_coloc_copy_v };
  ------------------
  |  |   55|  6.40k|#define aom_yv12_partial_coloc_copy_v aom_yv12_partial_coloc_copy_v_c
  ------------------
 1134|  6.40k|  assert(num_planes <= 3);
 1135|  20.7k|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1135:23): [True: 14.3k, False: 6.40k]
  ------------------
 1136|  14.3k|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (1136:9): [True: 4.15k, False: 10.1k]
  ------------------
 1137|  10.1k|    FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
 1138|  10.1k|    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
 1139|  10.1k|                     lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
 1140|  10.1k|  }
 1141|  6.40k|}
restoration.c:save_boundary_lines:
 1443|   103k|                                int plane, AV1_COMMON *cm, int after_cdef) {
 1444|   103k|  const int is_uv = plane > 0;
 1445|   103k|  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (1445:20): [True: 64.3k, False: 38.8k]
  |  Branch (1445:29): [True: 53.0k, False: 11.2k]
  ------------------
 1446|   103k|  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  ------------------
  |  |   34|   103k|#define RESTORATION_PROC_UNIT_SIZE 64
  ------------------
 1447|   103k|  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|   103k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
 1448|       |
 1449|   103k|  int plane_w, plane_h;
 1450|   103k|  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
 1451|       |
 1452|   103k|  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
 1453|       |
 1454|   103k|  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
  ------------------
  |  |   41|   103k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1455|       |
 1456|   103k|  int stripe_idx;
 1457|   485k|  for (stripe_idx = 0;; ++stripe_idx) {
 1458|   485k|    const int rel_y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
  ------------------
  |  |   35|   485k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 103k, False: 382k]
  |  |  ------------------
  ------------------
 1459|   485k|    const int y0 = rel_y0;
 1460|   485k|    if (y0 >= plane_h) break;
  ------------------
  |  Branch (1460:9): [True: 103k, False: 382k]
  ------------------
 1461|       |
 1462|   382k|    const int rel_y1 = (stripe_idx + 1) * stripe_height - stripe_off;
 1463|   382k|    const int y1 = AOMMIN(rel_y1, plane_h);
  ------------------
  |  |   34|   382k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 279k, False: 103k]
  |  |  ------------------
  ------------------
 1464|       |
 1465|       |    // Extend using CDEF pixels at the top and bottom of the frame,
 1466|       |    // and deblocked pixels at internal stripe boundaries
 1467|   382k|    const int use_deblock_above = (stripe_idx > 0);
 1468|   382k|    const int use_deblock_below = (y1 < plane_height);
 1469|       |
 1470|   382k|    if (!after_cdef) {
  ------------------
  |  Branch (1470:9): [True: 191k, False: 191k]
  ------------------
 1471|       |      // Save deblocked context at internal stripe boundaries
 1472|   191k|      if (use_deblock_above) {
  ------------------
  |  Branch (1472:11): [True: 139k, False: 51.5k]
  ------------------
 1473|   139k|        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
  ------------------
  |  |   66|   139k|#define RESTORATION_CTX_VERT 2
  ------------------
 1474|   139k|                                    stripe_idx, use_highbd, 1, boundaries);
 1475|   139k|      }
 1476|   191k|      if (use_deblock_below) {
  ------------------
  |  Branch (1476:11): [True: 139k, False: 51.5k]
  ------------------
 1477|   139k|        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
 1478|   139k|                                    use_highbd, 0, boundaries);
 1479|   139k|      }
 1480|   191k|    } else {
 1481|       |      // Save CDEF context at frame boundaries
 1482|   191k|      if (!use_deblock_above) {
  ------------------
  |  Branch (1482:11): [True: 51.5k, False: 139k]
  ------------------
 1483|  51.5k|        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
 1484|  51.5k|                                 1, boundaries);
 1485|  51.5k|      }
 1486|   191k|      if (!use_deblock_below) {
  ------------------
  |  Branch (1486:11): [True: 51.5k, False: 139k]
  ------------------
 1487|  51.5k|        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
 1488|  51.5k|                                 use_highbd, 0, boundaries);
 1489|  51.5k|      }
 1490|   191k|    }
 1491|   382k|  }
 1492|   103k|}
restoration.c:save_deblock_boundary_lines:
 1358|   279k|    RestorationStripeBoundaries *boundaries) {
 1359|   279k|  const int is_uv = plane > 0;
 1360|   279k|  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  ------------------
  |  |  205|   279k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|   154k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 154k, False: 124k]
  |  |  ------------------
  ------------------
 1361|   279k|  const int src_stride = frame->strides[is_uv] << use_highbd;
 1362|   279k|  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
 1363|       |
 1364|   279k|  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
  ------------------
  |  Branch (1364:23): [True: 139k, False: 139k]
  ------------------
 1365|   279k|                               : boundaries->stripe_boundary_below;
 1366|   279k|  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  ------------------
  |  |   70|   279k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
 1367|   279k|  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
 1368|   279k|  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
  ------------------
  |  |   66|   279k|#define RESTORATION_CTX_VERT 2
  ------------------
 1369|       |
 1370|       |  // There is a rare case in which a processing stripe can end 1px above the
 1371|       |  // crop border. In this case, we do want to use deblocked pixels from below
 1372|       |  // the stripe (hence why we ended up in this function), but instead of
 1373|       |  // fetching 2 "below" rows we need to fetch one and duplicate it.
 1374|       |  // This is equivalent to clamping the sample locations against the crop border
 1375|   279k|  const int lines_to_save =
 1376|   279k|      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
  ------------------
  |  |   34|   279k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 269k, False: 9.38k]
  |  |  ------------------
  ------------------
 1377|   279k|  assert(lines_to_save == 1 || lines_to_save == 2);
 1378|       |
 1379|   279k|  int upscaled_width;
 1380|   279k|  int line_bytes;
 1381|   279k|  if (av1_superres_scaled(cm)) {
  ------------------
  |  Branch (1381:7): [True: 24.1k, False: 255k]
  ------------------
 1382|  24.1k|    const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (1382:22): [True: 11.7k, False: 12.3k]
  |  Branch (1382:31): [True: 9.70k, False: 2.06k]
  ------------------
 1383|  24.1k|    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
 1384|  24.1k|    line_bytes = upscaled_width << use_highbd;
 1385|  24.1k|    if (use_highbd)
  ------------------
  |  Branch (1385:9): [True: 11.6k, False: 12.4k]
  ------------------
 1386|  11.6k|      av1_upscale_normative_rows(
 1387|  11.6k|          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
  ------------------
  |  |   76|  11.6k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
 1388|  11.6k|          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
  ------------------
  |  |   76|  11.6k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
 1389|  11.6k|          plane, lines_to_save);
 1390|  12.4k|    else
 1391|  12.4k|      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
 1392|  12.4k|                                 boundaries->stripe_boundary_stride, plane,
 1393|  12.4k|                                 lines_to_save);
 1394|   255k|  } else {
 1395|   255k|    upscaled_width = frame->crop_widths[is_uv];
 1396|   255k|    line_bytes = upscaled_width << use_highbd;
 1397|   760k|    for (int i = 0; i < lines_to_save; i++) {
  ------------------
  |  Branch (1397:21): [True: 505k, False: 255k]
  ------------------
 1398|   505k|      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
 1399|   505k|             line_bytes);
 1400|   505k|    }
 1401|   255k|  }
 1402|       |  // If we only saved one line, then copy it into the second line buffer
 1403|   279k|  if (lines_to_save == 1)
  ------------------
  |  Branch (1403:7): [True: 6.19k, False: 273k]
  ------------------
 1404|  6.19k|    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
 1405|       |
 1406|   279k|  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
  ------------------
  |  |   66|   279k|#define RESTORATION_CTX_VERT 2
  ------------------
 1407|   279k|               RESTORATION_EXTRA_HORZ, use_highbd);
  ------------------
  |  |   70|   279k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
 1408|   279k|}
restoration.c:extend_lines:
 1341|   382k|                         int extend, int use_highbitdepth) {
 1342|  1.14M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (1342:19): [True: 764k, False: 382k]
  ------------------
 1343|   764k|    if (use_highbitdepth) {
  ------------------
  |  Branch (1343:9): [True: 424k, False: 340k]
  ------------------
 1344|   424k|      uint16_t *buf16 = (uint16_t *)buf;
 1345|   424k|      aom_memset16(buf16 - extend, buf16[0], extend);
 1346|   424k|      aom_memset16(buf16 + width, buf16[width - 1], extend);
 1347|   424k|    } else {
 1348|   340k|      memset(buf - extend, buf[0], extend);
 1349|   340k|      memset(buf + width, buf[width - 1], extend);
 1350|   340k|    }
 1351|   764k|    buf += stride;
 1352|   764k|  }
 1353|   382k|}
restoration.c:save_cdef_boundary_lines:
 1413|   103k|                                     RestorationStripeBoundaries *boundaries) {
 1414|   103k|  const int is_uv = plane > 0;
 1415|   103k|  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  ------------------
  |  |  205|   103k|#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
  |  |  ------------------
  |  |  |  |   75|  57.1k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  |  |  ------------------
  |  |  |  Branch (205:27): [True: 57.1k, False: 46.0k]
  |  |  ------------------
  ------------------
 1416|   103k|  const int src_stride = frame->strides[is_uv] << use_highbd;
 1417|   103k|  const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride;
 1418|       |
 1419|   103k|  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
  ------------------
  |  Branch (1419:23): [True: 51.5k, False: 51.5k]
  ------------------
 1420|   103k|                               : boundaries->stripe_boundary_below;
 1421|   103k|  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  ------------------
  |  |   70|   103k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
 1422|   103k|  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
 1423|   103k|  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
  ------------------
  |  |   66|   103k|#define RESTORATION_CTX_VERT 2
  ------------------
 1424|   103k|  const int src_width = frame->crop_widths[is_uv];
 1425|       |
 1426|       |  // At the point where this function is called, we've already applied
 1427|       |  // superres. So we don't need to extend the lines here, we can just
 1428|       |  // pull directly from the topmost row of the upscaled frame.
 1429|   103k|  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  ------------------
  |  Branch (1429:20): [True: 64.3k, False: 38.8k]
  |  Branch (1429:29): [True: 61.6k, False: 2.65k]
  ------------------
 1430|   103k|  const int upscaled_width = av1_superres_scaled(cm)
  ------------------
  |  Branch (1430:30): [True: 13.2k, False: 89.8k]
  ------------------
 1431|   103k|                                 ? (cm->superres_upscaled_width + ss_x) >> ss_x
 1432|   103k|                                 : src_width;
 1433|   103k|  const int line_bytes = upscaled_width << use_highbd;
 1434|   309k|  for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
  ------------------
  |  |   66|   309k|#define RESTORATION_CTX_VERT 2
  ------------------
  |  Branch (1434:19): [True: 206k, False: 103k]
  ------------------
 1435|       |    // Copy the line at 'src_rows' into both context lines
 1436|   206k|    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
 1437|   206k|  }
 1438|   103k|  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
  ------------------
  |  |   66|   103k|#define RESTORATION_CTX_VERT 2
  ------------------
 1439|   103k|               RESTORATION_EXTRA_HORZ, use_highbd);
  ------------------
  |  |   70|   103k|#define RESTORATION_EXTRA_HORZ 4
  ------------------
 1440|   103k|}

blockd.c:set_default_wiener:
  302|   450k|static inline void set_default_wiener(WienerInfo *wiener_info) {
  303|   450k|  wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV;
  ------------------
  |  |  137|   450k|#define WIENER_FILT_TAP0_MIDV (3)
  ------------------
  304|   450k|  wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV;
  ------------------
  |  |  138|   450k|#define WIENER_FILT_TAP1_MIDV (-7)
  ------------------
  305|   450k|  wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV;
  ------------------
  |  |  139|   450k|#define WIENER_FILT_TAP2_MIDV (15)
  ------------------
  306|   450k|  wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] =
  ------------------
  |  |   43|   450k|#define WIENER_HALFWIN 3
  ------------------
                wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] =
  ------------------
  |  |   43|   450k|#define WIENER_HALFWIN 3
  ------------------
  307|   450k|      -2 *
  308|   450k|      (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);
  ------------------
  |  |  139|   450k|#define WIENER_FILT_TAP2_MIDV (15)
  ------------------
                    (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);
  ------------------
  |  |  138|   450k|#define WIENER_FILT_TAP1_MIDV (-7)
  ------------------
                    (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);
  ------------------
  |  |  137|   450k|#define WIENER_FILT_TAP0_MIDV (3)
  ------------------
  309|   450k|  wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV;
  ------------------
  |  |  139|   450k|#define WIENER_FILT_TAP2_MIDV (15)
  ------------------
  310|   450k|  wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV;
  ------------------
  |  |  138|   450k|#define WIENER_FILT_TAP1_MIDV (-7)
  ------------------
  311|   450k|  wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV;
  ------------------
  |  |  137|   450k|#define WIENER_FILT_TAP0_MIDV (3)
  ------------------
  312|   450k|}
blockd.c:set_default_sgrproj:
  297|   450k|static inline void set_default_sgrproj(SgrprojInfo *sgrproj_info) {
  298|   450k|  sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;
  ------------------
  |  |  106|   450k|#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
  |  |  ------------------
  |  |  |  |   99|   450k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
                sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;
  ------------------
  |  |  107|   450k|#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |  106|   450k|#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
  |  |  |  |  ------------------
  |  |  |  |  |  |   99|   450k|#define SGRPROJ_PRJ_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |   99|   450k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
  299|   450k|  sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;
  ------------------
  |  |  108|   450k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  ------------------
  |  |  |  |   99|   450k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
                sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;
  ------------------
  |  |  109|   450k|#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |  108|   450k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  |  |  ------------------
  |  |  |  |  |  |   99|   450k|#define SGRPROJ_PRJ_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |   99|   450k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
  300|   450k|}

av1_scale_mv:
   34|  2.31M|                  const struct scale_factors *sf) {
   35|  2.31M|  const int x_off_q4 = av1_scaled_x(x << SUBPEL_BITS, sf);
  ------------------
  |  |   23|  2.31M|#define SUBPEL_BITS 4
  ------------------
   36|  2.31M|  const int y_off_q4 = av1_scaled_y(y << SUBPEL_BITS, sf);
  ------------------
  |  |   23|  2.31M|#define SUBPEL_BITS 4
  ------------------
   37|  2.31M|  const MV32 res = {
   38|  2.31M|    av1_scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4,
  ------------------
  |  |   23|  2.31M|#define SUBPEL_BITS 4
  ------------------
   39|  2.31M|    av1_scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4
  ------------------
  |  |   23|  2.31M|#define SUBPEL_BITS 4
  ------------------
   40|  2.31M|  };
   41|  2.31M|  return res;
   42|  2.31M|}
av1_setup_scale_factors_for_frame:
   45|   385k|                                       int other_h, int this_w, int this_h) {
   46|   385k|  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
  ------------------
  |  Branch (46:7): [True: 6.81k, False: 378k]
  ------------------
   47|  6.81k|    sf->x_scale_fp = REF_INVALID_SCALE;
  ------------------
  |  |   26|  6.81k|#define REF_INVALID_SCALE -1
  ------------------
   48|  6.81k|    sf->y_scale_fp = REF_INVALID_SCALE;
  ------------------
  |  |   26|  6.81k|#define REF_INVALID_SCALE -1
  ------------------
   49|  6.81k|    return;
   50|  6.81k|  }
   51|       |
   52|   378k|  sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
   53|   378k|  sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
   54|       |
   55|   378k|  sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
   56|   378k|  sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);
   57|   378k|}
scale.c:get_fixed_point_scale_factor:
   19|   756k|static int get_fixed_point_scale_factor(int other_size, int this_size) {
   20|       |  // Calculate scaling factor once for each reference frame
   21|       |  // and use fixed point scaling factors in decoding and encoding routines.
   22|       |  // Hardware implementations can calculate scale factor in device driver
   23|       |  // and use multiplication and shifting on hardware instead of division.
   24|   756k|  return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;
  ------------------
  |  |   24|   756k|#define REF_SCALE_SHIFT 14
  ------------------
   25|   756k|}
scale.c:fixed_point_scale_to_coarse_point_scale:
   28|   756k|static int fixed_point_scale_to_coarse_point_scale(int scale_fp) {
   29|   756k|  return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);
  ------------------
  |  |   41|   756k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   30|   756k|}

decodeframe.c:valid_ref_frame_size:
   78|   271k|                                       int this_width, int this_height) {
   79|   271k|  return 2 * this_width >= ref_width && 2 * this_height >= ref_height &&
  ------------------
  |  Branch (79:10): [True: 267k, False: 4.00k]
  |  Branch (79:41): [True: 248k, False: 19.4k]
  ------------------
   80|   271k|         this_width <= 16 * ref_width && this_height <= 16 * ref_height;
  ------------------
  |  Branch (80:10): [True: 234k, False: 13.7k]
  |  Branch (80:42): [True: 232k, False: 2.12k]
  ------------------
   81|   271k|}
decodeframe.c:av1_is_valid_scale:
   64|  34.3M|static inline int av1_is_valid_scale(const struct scale_factors *sf) {
   65|  34.3M|  assert(sf != NULL);
   66|  34.3M|  return sf->x_scale_fp != REF_INVALID_SCALE &&
  ------------------
  |  |   26|  68.7M|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (66:10): [True: 34.3M, False: 5.38k]
  ------------------
   67|  34.3M|         sf->y_scale_fp != REF_INVALID_SCALE;
  ------------------
  |  |   26|  34.3M|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (67:10): [True: 34.3M, False: 18.4E]
  ------------------
   68|  34.3M|}
decodeframe.c:av1_is_scaled:
   70|  34.0M|static inline int av1_is_scaled(const struct scale_factors *sf) {
   71|  34.0M|  assert(sf != NULL);
   72|  34.0M|  return av1_is_valid_scale(sf) &&
  ------------------
  |  Branch (72:10): [True: 34.0M, False: 12.8k]
  ------------------
   73|  34.0M|         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|  68.0M|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|  34.0M|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
                       (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|  29.4M|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|  29.4M|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
  |  Branch (73:11): [True: 4.60M, False: 29.4M]
  |  Branch (73:45): [True: 8.92k, False: 29.4M]
  ------------------
   74|  34.0M|}
decodeframe.c:av1_scaled_y:
   45|  2.31M|static inline int av1_scaled_y(int val, const struct scale_factors *sf) {
   46|  2.31M|  const int off =
   47|  2.31M|      (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|  2.31M|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|  2.31M|#define SUBPEL_BITS 4
  ------------------
   48|  2.31M|  const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
   49|  2.31M|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|  2.31M|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|   207k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 207k, False: 2.11M]
  |  |  ------------------
  |  |   59|  2.31M|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  2.11M|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   50|  2.31M|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   51|  2.31M|}
decodeframe.c:av1_scaled_x:
   36|  2.31M|static inline int av1_scaled_x(int val, const struct scale_factors *sf) {
   37|  2.31M|  const int off =
   38|  2.31M|      (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|  2.31M|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|  2.31M|#define SUBPEL_BITS 4
  ------------------
   39|  2.31M|  const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
   40|  2.31M|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|  2.31M|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|   198k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 198k, False: 2.11M]
  |  |  ------------------
  |  |   59|  2.31M|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  2.11M|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   41|  2.31M|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   42|  2.31M|}
decodemv.c:av1_is_scaled:
   70|  1.46M|static inline int av1_is_scaled(const struct scale_factors *sf) {
   71|  1.46M|  assert(sf != NULL);
   72|  1.46M|  return av1_is_valid_scale(sf) &&
  ------------------
  |  Branch (72:10): [True: 1.46M, False: 10]
  ------------------
   73|  1.46M|         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|  2.93M|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|  1.46M|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
                       (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|  1.36M|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|  1.36M|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
  |  Branch (73:11): [True: 104k, False: 1.36M]
  |  Branch (73:45): [True: 49, False: 1.36M]
  ------------------
   74|  1.46M|}
decodemv.c:av1_is_valid_scale:
   64|  1.46M|static inline int av1_is_valid_scale(const struct scale_factors *sf) {
   65|  1.46M|  assert(sf != NULL);
   66|  1.46M|  return sf->x_scale_fp != REF_INVALID_SCALE &&
  ------------------
  |  |   26|  2.93M|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (66:10): [True: 1.46M, False: 18.4E]
  ------------------
   67|  1.46M|         sf->y_scale_fp != REF_INVALID_SCALE;
  ------------------
  |  |   26|  1.46M|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (67:10): [True: 1.46M, False: 18.4E]
  ------------------
   68|  1.46M|}
reconinter.c:av1_is_scaled:
   70|  24.1M|static inline int av1_is_scaled(const struct scale_factors *sf) {
   71|  24.1M|  assert(sf != NULL);
   72|  24.1M|  return av1_is_valid_scale(sf) &&
  ------------------
  |  Branch (72:10): [True: 24.1M, False: 3.59k]
  ------------------
   73|  24.1M|         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|  48.3M|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|  24.1M|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
                       (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
  ------------------
  |  |   25|  20.9M|#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
  |  |  ------------------
  |  |  |  |   24|  20.9M|#define REF_SCALE_SHIFT 14
  |  |  ------------------
  ------------------
  |  Branch (73:11): [True: 3.18M, False: 20.9M]
  |  Branch (73:45): [True: 6.08k, False: 20.9M]
  ------------------
   74|  24.1M|}
reconinter.c:av1_scaled_x:
   36|  2.47M|static inline int av1_scaled_x(int val, const struct scale_factors *sf) {
   37|  2.47M|  const int off =
   38|  2.47M|      (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|  2.47M|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|  2.47M|#define SUBPEL_BITS 4
  ------------------
   39|  2.47M|  const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
   40|  2.47M|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|  2.47M|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|   282k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 282k, False: 2.19M]
  |  |  ------------------
  |  |   59|  2.47M|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  2.19M|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   41|  2.47M|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   42|  2.47M|}
reconinter.c:av1_scaled_y:
   45|  2.47M|static inline int av1_scaled_y(int val, const struct scale_factors *sf) {
   46|  2.47M|  const int off =
   47|  2.47M|      (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|  2.47M|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|  2.47M|#define SUBPEL_BITS 4
  ------------------
   48|  2.47M|  const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
   49|  2.47M|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|  2.47M|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|   345k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 345k, False: 2.13M]
  |  |  ------------------
  |  |   59|  2.47M|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  2.13M|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   50|  2.47M|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   51|  2.47M|}
reconinter.c:av1_unscaled_value:
   54|  30.9M|static inline int av1_unscaled_value(int val, const struct scale_factors *sf) {
   55|  30.9M|  (void)sf;
   56|  30.9M|  return val * (1 << SCALE_EXTRA_BITS);
  ------------------
  |  |   31|  30.9M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  30.9M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  30.9M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   57|  30.9M|}
reconinter.c:av1_is_valid_scale:
   64|  25.4M|static inline int av1_is_valid_scale(const struct scale_factors *sf) {
   65|  25.4M|  assert(sf != NULL);
   66|  25.4M|  return sf->x_scale_fp != REF_INVALID_SCALE &&
  ------------------
  |  |   26|  50.8M|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (66:10): [True: 25.4M, False: 18.4E]
  ------------------
   67|  25.4M|         sf->y_scale_fp != REF_INVALID_SCALE;
  ------------------
  |  |   26|  25.4M|#define REF_INVALID_SCALE -1
  ------------------
  |  Branch (67:10): [True: 25.4M, False: 18.4E]
  ------------------
   68|  25.4M|}
scale.c:av1_scaled_x:
   36|  4.62M|static inline int av1_scaled_x(int val, const struct scale_factors *sf) {
   37|  4.62M|  const int off =
   38|  4.62M|      (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|  4.62M|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|  4.62M|#define SUBPEL_BITS 4
  ------------------
   39|  4.62M|  const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
   40|  4.62M|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|  4.62M|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|   133k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 133k, False: 4.48M]
  |  |  ------------------
  |  |   59|  4.62M|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  4.48M|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   41|  4.62M|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   42|  4.62M|}
scale.c:av1_scaled_y:
   45|  4.62M|static inline int av1_scaled_y(int val, const struct scale_factors *sf) {
   46|  4.62M|  const int off =
   47|  4.62M|      (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   24|  4.62M|#define REF_SCALE_SHIFT 14
  ------------------
                    (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
  ------------------
  |  |   23|  4.62M|#define SUBPEL_BITS 4
  ------------------
   48|  4.62M|  const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
   49|  4.62M|  return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
  ------------------
  |  |   58|  4.62M|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|   166k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 166k, False: 4.46M]
  |  |  ------------------
  |  |   59|  4.62M|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|  4.46M|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
   50|  4.62M|                                           REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
   51|  4.62M|}
scale.c:valid_ref_frame_size:
   78|   385k|                                       int this_width, int this_height) {
   79|   385k|  return 2 * this_width >= ref_width && 2 * this_height >= ref_height &&
  ------------------
  |  Branch (79:10): [True: 384k, False: 409]
  |  Branch (79:41): [True: 382k, False: 2.65k]
  ------------------
   80|   385k|         this_width <= 16 * ref_width && this_height <= 16 * ref_height;
  ------------------
  |  Branch (80:10): [True: 380k, False: 2.08k]
  |  Branch (80:42): [True: 378k, False: 1.66k]
  ------------------
   81|   385k|}

decodetxb.c:get_scan:
   46|  21.2M|static inline const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
   47|  21.2M|  return get_default_scan(tx_size, tx_type);
   48|  21.2M|}
decodetxb.c:get_default_scan:
   42|  21.4M|                                                 TX_TYPE tx_type) {
   43|  21.4M|  return &av1_scan_orders[tx_size][tx_type];
   44|  21.4M|}

av1_clearall_segfeatures:
   37|   168k|void av1_clearall_segfeatures(struct segmentation *seg) {
   38|   168k|  av1_zero(seg->feature_data);
  ------------------
  |  |   43|   168k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
   39|   168k|  av1_zero(seg->feature_mask);
  ------------------
  |  |   43|   168k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
   40|   168k|}
av1_calculate_segdata:
   42|  16.5k|void av1_calculate_segdata(struct segmentation *seg) {
   43|  16.5k|  seg->segid_preskip = 0;
   44|  16.5k|  seg->last_active_segid = 0;
   45|   148k|  for (int i = 0; i < MAX_SEGMENTS; i++) {
  ------------------
  |  |   21|   148k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (45:19): [True: 132k, False: 16.5k]
  ------------------
   46|  1.18M|    for (int j = 0; j < SEG_LVL_MAX; j++) {
  ------------------
  |  Branch (46:21): [True: 1.05M, False: 132k]
  ------------------
   47|  1.05M|      if (seg->feature_mask[i] & (1 << j)) {
  ------------------
  |  Branch (47:11): [True: 263k, False: 793k]
  ------------------
   48|   263k|        seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME);
   49|   263k|        seg->last_active_segid = i;
   50|   263k|      }
   51|  1.05M|    }
   52|   132k|  }
   53|  16.5k|}
av1_enable_segfeature:
   56|   278k|                           SEG_LVL_FEATURES feature_id) {
   57|   278k|  seg->feature_mask[segment_id] |= 1 << feature_id;
   58|   278k|}
av1_seg_feature_data_max:
   60|   278k|int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
   61|   278k|  return seg_feature_data_max[feature_id];
   62|   278k|}
av1_is_segfeature_signed:
   64|   278k|int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
   65|   278k|  return seg_feature_data_signed[feature_id];
   66|   278k|}
av1_set_segdata:
   80|  1.13M|                     SEG_LVL_FEATURES feature_id, int seg_data) {
   81|  1.13M|  if (seg_data < 0) {
  ------------------
  |  Branch (81:7): [True: 89.8k, False: 1.04M]
  ------------------
   82|  89.8k|    assert(seg_feature_data_signed[feature_id]);
   83|  89.8k|    assert(-seg_data <= seg_feature_data_max[feature_id]);
   84|  1.04M|  } else {
   85|  1.04M|    assert(seg_data <= seg_feature_data_max[feature_id]);
   86|  1.04M|  }
   87|       |
   88|  1.13M|  seg->feature_data[segment_id][feature_id] = seg_data;
   89|  1.13M|}

decodeframe.c:segfeatures_copy:
   68|   193k|                                    const struct segmentation *src) {
   69|   193k|  int i, j;
   70|  1.74M|  for (i = 0; i < MAX_SEGMENTS; i++) {
  ------------------
  |  |   21|  1.74M|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (70:15): [True: 1.54M, False: 193k]
  ------------------
   71|  1.54M|    dst->feature_mask[i] = src->feature_mask[i];
   72|  13.9M|    for (j = 0; j < SEG_LVL_MAX; j++) {
  ------------------
  |  Branch (72:17): [True: 12.3M, False: 1.54M]
  ------------------
   73|  12.3M|      dst->feature_data[i][j] = src->feature_data[i][j];
   74|  12.3M|    }
   75|  1.54M|  }
   76|   193k|  dst->segid_preskip = src->segid_preskip;
   77|   193k|  dst->last_active_segid = src->last_active_segid;
   78|   193k|}
decodemv.c:segfeature_active:
   63|  61.5M|                                    SEG_LVL_FEATURES feature_id) {
   64|  61.5M|  return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
  ------------------
  |  Branch (64:10): [True: 9.37M, False: 52.1M]
  |  Branch (64:26): [True: 5.56M, False: 3.81M]
  ------------------
   65|  61.5M|}
decodemv.c:set_segment_id:
  101|  4.02M|                                  uint8_t segment_id) {
  102|  4.02M|  segment_ids += mi_offset;
  103|  21.8M|  for (int y = 0; y < y_mis; ++y) {
  ------------------
  |  Branch (103:19): [True: 17.8M, False: 4.02M]
  ------------------
  104|  17.8M|    memset(&segment_ids[y * mi_stride], segment_id,
  105|  17.8M|           x_mis * sizeof(segment_ids[0]));
  106|  17.8M|  }
  107|  4.02M|}
decodemv.c:get_segdata:
   95|   225k|                              SEG_LVL_FEATURES feature_id) {
   96|   225k|  return seg->feature_data[segment_id][feature_id];
   97|   225k|}
av1_loopfilter.c:segfeature_active:
   63|   115M|                                    SEG_LVL_FEATURES feature_id) {
   64|   119M|  return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
  ------------------
  |  Branch (64:10): [True: 119M, False: 18.4E]
  |  Branch (64:26): [True: 98.6M, False: 20.5M]
  ------------------
   65|   115M|}
av1_loopfilter.c:get_segdata:
   95|  96.4M|                              SEG_LVL_FEATURES feature_id) {
   96|  96.4M|  return seg->feature_data[segment_id][feature_id];
   97|  96.4M|}
quant_common.c:segfeature_active:
   63|  18.8M|                                    SEG_LVL_FEATURES feature_id) {
   64|  18.8M|  return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
  ------------------
  |  Branch (64:10): [True: 14.0M, False: 4.79M]
  |  Branch (64:26): [True: 6.87M, False: 7.13M]
  ------------------
   65|  18.8M|}
quant_common.c:get_segdata:
   95|  6.87M|                              SEG_LVL_FEATURES feature_id) {
   96|  6.87M|  return seg->feature_data[segment_id][feature_id];
   97|  6.87M|}

av1_loop_filter_alloc:
   67|  3.41k|                           int width, int num_workers) {
   68|  3.41k|  lf_sync->rows = rows;
   69|  3.41k|#if CONFIG_MULTITHREAD
   70|  3.41k|  {
   71|  3.41k|    int i, j;
   72|       |
   73|  13.6k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  13.6k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (73:17): [True: 10.2k, False: 3.41k]
  ------------------
   74|  10.2k|      CHECK_MEM_ERROR(cm, lf_sync->mutex_[j],
  ------------------
  |  |   51|  10.2k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  10.2k|  do {                                                    \
  |  |  |  |   69|  10.2k|    lval = (expr);                                        \
  |  |  |  |   70|  10.2k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 10.2k]
  |  |  |  |  ------------------
  |  |  |  |   71|  10.2k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  10.2k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   75|  10.2k|                      aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows));
   76|  10.2k|      if (lf_sync->mutex_[j]) {
  ------------------
  |  Branch (76:11): [True: 10.2k, False: 0]
  ------------------
   77|  44.9k|        for (i = 0; i < rows; ++i) {
  ------------------
  |  Branch (77:21): [True: 34.7k, False: 10.2k]
  ------------------
   78|  34.7k|          pthread_mutex_init(&lf_sync->mutex_[j][i], NULL);
   79|  34.7k|        }
   80|  10.2k|      }
   81|       |
   82|  10.2k|      CHECK_MEM_ERROR(cm, lf_sync->cond_[j],
  ------------------
  |  |   51|  10.2k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  10.2k|  do {                                                    \
  |  |  |  |   69|  10.2k|    lval = (expr);                                        \
  |  |  |  |   70|  10.2k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 10.2k]
  |  |  |  |  ------------------
  |  |  |  |   71|  10.2k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  10.2k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   83|  10.2k|                      aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows));
   84|  10.2k|      if (lf_sync->cond_[j]) {
  ------------------
  |  Branch (84:11): [True: 10.2k, False: 0]
  ------------------
   85|  44.9k|        for (i = 0; i < rows; ++i) {
  ------------------
  |  Branch (85:21): [True: 34.7k, False: 10.2k]
  ------------------
   86|  34.7k|          pthread_cond_init(&lf_sync->cond_[j][i], NULL);
   87|  34.7k|        }
   88|  10.2k|      }
   89|  10.2k|    }
   90|       |
   91|  3.41k|    CHECK_MEM_ERROR(cm, lf_sync->job_mutex,
  ------------------
  |  |   51|  3.41k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.41k|  do {                                                    \
  |  |  |  |   69|  3.41k|    lval = (expr);                                        \
  |  |  |  |   70|  3.41k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.41k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.41k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.41k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   92|  3.41k|                    aom_malloc(sizeof(*(lf_sync->job_mutex))));
   93|  3.41k|    if (lf_sync->job_mutex) {
  ------------------
  |  Branch (93:9): [True: 3.41k, False: 0]
  ------------------
   94|  3.41k|      pthread_mutex_init(lf_sync->job_mutex, NULL);
   95|  3.41k|    }
   96|  3.41k|  }
   97|  3.41k|#endif  // CONFIG_MULTITHREAD
   98|  3.41k|  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
  ------------------
  |  |   51|  3.41k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.41k|  do {                                                    \
  |  |  |  |   69|  3.41k|    lval = (expr);                                        \
  |  |  |  |   70|  3.41k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.41k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.41k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.41k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   99|  3.41k|                  aom_malloc(num_workers * sizeof(*(lf_sync->lfdata))));
  100|  3.41k|  lf_sync->num_workers = num_workers;
  101|       |
  102|  13.6k|  for (int j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  13.6k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (102:19): [True: 10.2k, False: 3.41k]
  ------------------
  103|  10.2k|    CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j],
  ------------------
  |  |   51|  10.2k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  10.2k|  do {                                                    \
  |  |  |  |   69|  10.2k|    lval = (expr);                                        \
  |  |  |  |   70|  10.2k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 10.2k]
  |  |  |  |  ------------------
  |  |  |  |   71|  10.2k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  10.2k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  104|  10.2k|                    aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows));
  105|  10.2k|  }
  106|  3.41k|  CHECK_MEM_ERROR(
  ------------------
  |  |   51|  3.41k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.41k|  do {                                                    \
  |  |  |  |   69|  3.41k|    lval = (expr);                                        \
  |  |  |  |   70|  3.41k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.41k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.41k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.41k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  107|  3.41k|      cm, lf_sync->job_queue,
  108|  3.41k|      aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2));
  109|       |  // Set up nsync.
  110|  3.41k|  lf_sync->sync_range = get_sync_range(width);
  111|  3.41k|}
av1_loop_filter_dealloc:
  114|  6.93k|void av1_loop_filter_dealloc(AV1LfSync *lf_sync) {
  115|  6.93k|  if (lf_sync != NULL) {
  ------------------
  |  Branch (115:7): [True: 6.93k, False: 0]
  ------------------
  116|  6.93k|    int j;
  117|  6.93k|#if CONFIG_MULTITHREAD
  118|  6.93k|    int i;
  119|  27.7k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  27.7k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (119:17): [True: 20.7k, False: 6.93k]
  ------------------
  120|  20.7k|      if (lf_sync->mutex_[j] != NULL) {
  ------------------
  |  Branch (120:11): [True: 10.2k, False: 10.5k]
  ------------------
  121|  44.9k|        for (i = 0; i < lf_sync->rows; ++i) {
  ------------------
  |  Branch (121:21): [True: 34.7k, False: 10.2k]
  ------------------
  122|  34.7k|          pthread_mutex_destroy(&lf_sync->mutex_[j][i]);
  123|  34.7k|        }
  124|  10.2k|        aom_free(lf_sync->mutex_[j]);
  125|  10.2k|      }
  126|  20.7k|      if (lf_sync->cond_[j] != NULL) {
  ------------------
  |  Branch (126:11): [True: 10.2k, False: 10.5k]
  ------------------
  127|  44.9k|        for (i = 0; i < lf_sync->rows; ++i) {
  ------------------
  |  Branch (127:21): [True: 34.7k, False: 10.2k]
  ------------------
  128|  34.7k|          pthread_cond_destroy(&lf_sync->cond_[j][i]);
  129|  34.7k|        }
  130|  10.2k|        aom_free(lf_sync->cond_[j]);
  131|  10.2k|      }
  132|  20.7k|    }
  133|  6.93k|    if (lf_sync->job_mutex != NULL) {
  ------------------
  |  Branch (133:9): [True: 3.41k, False: 3.52k]
  ------------------
  134|  3.41k|      pthread_mutex_destroy(lf_sync->job_mutex);
  135|  3.41k|      aom_free(lf_sync->job_mutex);
  136|  3.41k|    }
  137|  6.93k|#endif  // CONFIG_MULTITHREAD
  138|  6.93k|    aom_free(lf_sync->lfdata);
  139|  27.7k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  27.7k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (139:17): [True: 20.7k, False: 6.93k]
  ------------------
  140|  20.7k|      aom_free(lf_sync->cur_sb_col[j]);
  141|  20.7k|    }
  142|       |
  143|  6.93k|    aom_free(lf_sync->job_queue);
  144|       |    // clear the structure as the source of this call may be a resize in which
  145|       |    // case this call will be followed by an _alloc() which may fail.
  146|  6.93k|    av1_zero(*lf_sync);
  ------------------
  |  |   43|  6.93k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  147|  6.93k|  }
  148|  6.93k|}
av1_alloc_cdef_sync:
  151|  91.1k|                         int num_workers) {
  152|  91.1k|  if (num_workers < 1) return;
  ------------------
  |  Branch (152:7): [True: 30.7k, False: 60.3k]
  ------------------
  153|  60.3k|#if CONFIG_MULTITHREAD
  154|  60.3k|  if (cdef_sync->mutex_ == NULL) {
  ------------------
  |  Branch (154:7): [True: 2.82k, False: 57.5k]
  ------------------
  155|  2.82k|    CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
  ------------------
  |  |   51|  2.82k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  2.82k|  do {                                                    \
  |  |  |  |   69|  2.82k|    lval = (expr);                                        \
  |  |  |  |   70|  2.82k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 2.82k]
  |  |  |  |  ------------------
  |  |  |  |   71|  2.82k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  2.82k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  156|  2.82k|                    aom_malloc(sizeof(*(cdef_sync->mutex_))));
  157|  2.82k|    if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
  ------------------
  |  Branch (157:9): [True: 2.82k, False: 0]
  ------------------
  158|  2.82k|  }
  159|       |#else
  160|       |  (void)cm;
  161|       |  (void)cdef_sync;
  162|       |#endif  // CONFIG_MULTITHREAD
  163|  60.3k|}
av1_free_cdef_sync:
  165|  16.1k|void av1_free_cdef_sync(AV1CdefSync *cdef_sync) {
  166|  16.1k|  if (cdef_sync == NULL) return;
  ------------------
  |  Branch (166:7): [True: 0, False: 16.1k]
  ------------------
  167|  16.1k|#if CONFIG_MULTITHREAD
  168|  16.1k|  if (cdef_sync->mutex_ != NULL) {
  ------------------
  |  Branch (168:7): [True: 2.82k, False: 13.2k]
  ------------------
  169|  2.82k|    pthread_mutex_destroy(cdef_sync->mutex_);
  170|  2.82k|    aom_free(cdef_sync->mutex_);
  171|  2.82k|  }
  172|  16.1k|#endif  // CONFIG_MULTITHREAD
  173|  16.1k|}
av1_thread_loop_filter_rows:
  271|   346k|    int num_mis_in_lpf_unit_height_log2) {
  272|       |  // TODO(aomedia:3276): Pass error_info to the low-level functions as required
  273|       |  // in future to handle error propagation.
  274|   346k|  (void)error_info;
  275|   346k|  const int sb_cols =
  276|   346k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   62|   346k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  277|   346k|  const int r = mi_row >> num_mis_in_lpf_unit_height_log2;
  278|   346k|  int mi_col, c;
  279|       |
  280|   346k|  const bool joint_filter_chroma = (lpf_opt_level == 2) && plane > AOM_PLANE_Y;
  ------------------
  |  |  226|   346k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (280:36): [True: 0, False: 346k]
  |  Branch (280:60): [True: 0, False: 0]
  ------------------
  281|   346k|  const int num_planes = joint_filter_chroma ? 2 : 1;
  ------------------
  |  Branch (281:26): [True: 0, False: 346k]
  ------------------
  282|   346k|  assert(IMPLIES(joint_filter_chroma, plane == AOM_PLANE_U));
  283|       |
  284|   346k|  if (dir == 0) {
  ------------------
  |  Branch (284:7): [True: 173k, False: 173k]
  ------------------
  285|   796k|    for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
  ------------------
  |  |   44|   623k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|   623k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   623k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|   623k|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (285:22): [True: 623k, False: 173k]
  ------------------
  286|   623k|      c = mi_col >> MAX_MIB_SIZE_LOG2;
  ------------------
  |  |   43|   623k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   623k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   623k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  287|       |
  288|   623k|      av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
  289|   623k|                           mi_row, mi_col, plane, plane + num_planes);
  290|   623k|      if (lpf_opt_level) {
  ------------------
  |  Branch (290:11): [True: 0, False: 623k]
  ------------------
  291|      0|        if (plane == AOM_PLANE_Y) {
  ------------------
  |  |  226|      0|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (291:13): [True: 0, False: 0]
  ------------------
  292|      0|          av1_filter_block_plane_vert_opt(cm, xd, &planes[plane], mi_row,
  293|      0|                                          mi_col, params_buf, tx_buf,
  294|      0|                                          num_mis_in_lpf_unit_height_log2);
  295|      0|        } else {
  296|      0|          av1_filter_block_plane_vert_opt_chroma(
  297|      0|              cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane,
  298|      0|              joint_filter_chroma, num_mis_in_lpf_unit_height_log2);
  299|      0|        }
  300|   623k|      } else {
  301|   623k|        av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
  302|   623k|                                    mi_col);
  303|   623k|      }
  304|   623k|      if (lf_sync != NULL) {
  ------------------
  |  Branch (304:11): [True: 551k, False: 72.1k]
  ------------------
  305|   551k|        sync_write(lf_sync, r, c, sb_cols, plane);
  306|   551k|      }
  307|   623k|    }
  308|   173k|  } else if (dir == 1) {
  ------------------
  |  Branch (308:14): [True: 173k, False: 18.4E]
  ------------------
  309|   799k|    for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
  ------------------
  |  |   44|   626k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|   626k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   626k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|   626k|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (309:22): [True: 626k, False: 173k]
  ------------------
  310|   626k|      c = mi_col >> MAX_MIB_SIZE_LOG2;
  ------------------
  |  |   43|   626k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   626k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   626k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  311|       |
  312|   626k|      if (lf_sync != NULL) {
  ------------------
  |  Branch (312:11): [True: 557k, False: 68.3k]
  ------------------
  313|       |        // Wait for vertical edge filtering of the top-right block to be
  314|       |        // completed
  315|   557k|        sync_read(lf_sync, r, c, plane);
  316|       |
  317|       |        // Wait for vertical edge filtering of the right block to be completed
  318|   557k|        sync_read(lf_sync, r + 1, c, plane);
  319|   557k|      }
  320|       |
  321|   626k|#if CONFIG_MULTITHREAD
  322|   626k|      if (lf_sync && lf_sync->num_workers > 1) {
  ------------------
  |  Branch (322:11): [True: 558k, False: 67.2k]
  |  Branch (322:22): [True: 558k, False: 1]
  ------------------
  323|   558k|        pthread_mutex_lock(lf_sync->job_mutex);
  324|   558k|        const bool lf_mt_exit = lf_sync->lf_mt_exit;
  325|   558k|        pthread_mutex_unlock(lf_sync->job_mutex);
  326|       |        // Exit in case any worker has encountered an error.
  327|   558k|        if (lf_mt_exit) return;
  ------------------
  |  Branch (327:13): [True: 0, False: 558k]
  ------------------
  328|   558k|      }
  329|   626k|#endif
  330|       |
  331|   626k|      av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
  332|   626k|                           mi_row, mi_col, plane, plane + num_planes);
  333|   626k|      if (lpf_opt_level) {
  ------------------
  |  Branch (333:11): [True: 0, False: 626k]
  ------------------
  334|      0|        if (plane == AOM_PLANE_Y) {
  ------------------
  |  |  226|      0|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (334:13): [True: 0, False: 0]
  ------------------
  335|      0|          av1_filter_block_plane_horz_opt(cm, xd, &planes[plane], mi_row,
  336|      0|                                          mi_col, params_buf, tx_buf,
  337|      0|                                          num_mis_in_lpf_unit_height_log2);
  338|      0|        } else {
  339|      0|          av1_filter_block_plane_horz_opt_chroma(
  340|      0|              cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane,
  341|      0|              joint_filter_chroma, num_mis_in_lpf_unit_height_log2);
  342|      0|        }
  343|   626k|      } else {
  344|   626k|        av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
  345|   626k|                                    mi_col);
  346|   626k|      }
  347|   626k|    }
  348|   173k|  }
  349|   346k|}
av1_loop_filter_frame_mt:
  495|  30.0k|                              int lpf_opt_level) {
  496|  30.0k|  int start_mi_row, end_mi_row, mi_rows_to_filter;
  497|  30.0k|  int planes_to_lf[MAX_MB_PLANE];
  498|       |
  499|  30.0k|  if (!check_planes_to_loop_filter(&cm->lf, planes_to_lf, plane_start,
  ------------------
  |  Branch (499:7): [True: 0, False: 30.0k]
  ------------------
  500|  30.0k|                                   plane_end))
  501|      0|    return;
  502|       |
  503|  30.0k|  start_mi_row = 0;
  504|  30.0k|  mi_rows_to_filter = cm->mi_params.mi_rows;
  505|  30.0k|  if (partial_frame && cm->mi_params.mi_rows > 8) {
  ------------------
  |  Branch (505:7): [True: 0, False: 30.0k]
  |  Branch (505:24): [True: 0, False: 0]
  ------------------
  506|      0|    start_mi_row = cm->mi_params.mi_rows >> 1;
  507|      0|    start_mi_row &= 0xfffffff8;
  508|      0|    mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8);
  ------------------
  |  |   35|      0|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
  509|      0|  }
  510|  30.0k|  end_mi_row = start_mi_row + mi_rows_to_filter;
  511|  30.0k|  av1_loop_filter_frame_init(cm, plane_start, plane_end);
  512|       |
  513|  30.0k|  if (num_workers > 1) {
  ------------------
  |  Branch (513:7): [True: 22.7k, False: 7.29k]
  ------------------
  514|       |    // Enqueue and execute loopfiltering jobs.
  515|  22.7k|    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf,
  516|  22.7k|                        workers, num_workers, lf_sync, lpf_opt_level);
  517|  22.7k|  } else {
  518|       |    // Directly filter in the main thread.
  519|  7.29k|    loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf,
  520|  7.29k|                     lpf_opt_level);
  521|  7.29k|  }
  522|  30.0k|}
av1_loop_restoration_alloc:
  589|  1.19k|                                int num_planes, int width) {
  590|  1.19k|  lr_sync->rows = num_rows_lr;
  591|  1.19k|  lr_sync->num_planes = num_planes;
  592|  1.19k|#if CONFIG_MULTITHREAD
  593|  1.19k|  {
  594|  1.19k|    int i, j;
  595|       |
  596|  4.68k|    for (j = 0; j < num_planes; j++) {
  ------------------
  |  Branch (596:17): [True: 3.48k, False: 1.19k]
  ------------------
  597|  3.48k|      CHECK_MEM_ERROR(cm, lr_sync->mutex_[j],
  ------------------
  |  |   51|  3.48k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.48k|  do {                                                    \
  |  |  |  |   69|  3.48k|    lval = (expr);                                        \
  |  |  |  |   70|  3.48k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.48k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.48k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.48k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  598|  3.48k|                      aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr));
  599|  3.48k|      if (lr_sync->mutex_[j]) {
  ------------------
  |  Branch (599:11): [True: 3.48k, False: 0]
  ------------------
  600|  14.2k|        for (i = 0; i < num_rows_lr; ++i) {
  ------------------
  |  Branch (600:21): [True: 10.8k, False: 3.48k]
  ------------------
  601|  10.8k|          pthread_mutex_init(&lr_sync->mutex_[j][i], NULL);
  602|  10.8k|        }
  603|  3.48k|      }
  604|       |
  605|  3.48k|      CHECK_MEM_ERROR(cm, lr_sync->cond_[j],
  ------------------
  |  |   51|  3.48k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.48k|  do {                                                    \
  |  |  |  |   69|  3.48k|    lval = (expr);                                        \
  |  |  |  |   70|  3.48k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.48k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.48k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.48k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  606|  3.48k|                      aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr));
  607|  3.48k|      if (lr_sync->cond_[j]) {
  ------------------
  |  Branch (607:11): [True: 3.48k, False: 0]
  ------------------
  608|  14.2k|        for (i = 0; i < num_rows_lr; ++i) {
  ------------------
  |  Branch (608:21): [True: 10.8k, False: 3.48k]
  ------------------
  609|  10.8k|          pthread_cond_init(&lr_sync->cond_[j][i], NULL);
  610|  10.8k|        }
  611|  3.48k|      }
  612|  3.48k|    }
  613|       |
  614|  1.19k|    CHECK_MEM_ERROR(cm, lr_sync->job_mutex,
  ------------------
  |  |   51|  1.19k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.19k|  do {                                                    \
  |  |  |  |   69|  1.19k|    lval = (expr);                                        \
  |  |  |  |   70|  1.19k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.19k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.19k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.19k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  615|  1.19k|                    aom_malloc(sizeof(*(lr_sync->job_mutex))));
  616|  1.19k|    if (lr_sync->job_mutex) {
  ------------------
  |  Branch (616:9): [True: 1.19k, False: 0]
  ------------------
  617|  1.19k|      pthread_mutex_init(lr_sync->job_mutex, NULL);
  618|  1.19k|    }
  619|  1.19k|  }
  620|  1.19k|#endif  // CONFIG_MULTITHREAD
  621|  1.19k|  CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata,
  ------------------
  |  |   51|  1.19k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.19k|  do {                                                    \
  |  |  |  |   69|  1.19k|    lval = (expr);                                        \
  |  |  |  |   70|  1.19k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.19k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.19k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.19k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  622|  1.19k|                  aom_calloc(num_workers, sizeof(*(lr_sync->lrworkerdata))));
  623|  1.19k|  lr_sync->num_workers = num_workers;
  624|       |
  625|  37.3k|  for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
  ------------------
  |  Branch (625:28): [True: 36.1k, False: 1.19k]
  ------------------
  626|  36.1k|    if (worker_idx < num_workers - 1) {
  ------------------
  |  Branch (626:9): [True: 34.9k, False: 1.19k]
  ------------------
  627|  34.9k|      CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf,
  ------------------
  |  |   51|  34.9k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  34.9k|  do {                                                    \
  |  |  |  |   69|  34.9k|    lval = (expr);                                        \
  |  |  |  |   70|  34.9k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 34.9k]
  |  |  |  |  ------------------
  |  |  |  |   71|  34.9k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  34.9k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  628|  34.9k|                      (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
  629|  34.9k|      CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs,
  ------------------
  |  |   51|  34.9k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  34.9k|  do {                                                    \
  |  |  |  |   69|  34.9k|    lval = (expr);                                        \
  |  |  |  |   70|  34.9k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 34.9k]
  |  |  |  |  ------------------
  |  |  |  |   71|  34.9k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  34.9k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  630|  34.9k|                      aom_malloc(sizeof(RestorationLineBuffers)));
  631|       |
  632|  34.9k|    } else {
  633|  1.19k|      lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf;
  634|  1.19k|      lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs;
  635|  1.19k|    }
  636|  36.1k|  }
  637|       |
  638|  4.68k|  for (int j = 0; j < num_planes; j++) {
  ------------------
  |  Branch (638:19): [True: 3.48k, False: 1.19k]
  ------------------
  639|  3.48k|    CHECK_MEM_ERROR(
  ------------------
  |  |   51|  3.48k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.48k|  do {                                                    \
  |  |  |  |   69|  3.48k|    lval = (expr);                                        \
  |  |  |  |   70|  3.48k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.48k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.48k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.48k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  640|  3.48k|        cm, lr_sync->cur_sb_col[j],
  641|  3.48k|        aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr));
  642|  3.48k|  }
  643|  1.19k|  CHECK_MEM_ERROR(
  ------------------
  |  |   51|  1.19k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  1.19k|  do {                                                    \
  |  |  |  |   69|  1.19k|    lval = (expr);                                        \
  |  |  |  |   70|  1.19k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 1.19k]
  |  |  |  |  ------------------
  |  |  |  |   71|  1.19k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  1.19k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  644|  1.19k|      cm, lr_sync->job_queue,
  645|  1.19k|      aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes));
  646|       |  // Set up nsync.
  647|  1.19k|  lr_sync->sync_range = get_lr_sync_range(width);
  648|  1.19k|}
av1_loop_restoration_dealloc:
  651|  4.71k|void av1_loop_restoration_dealloc(AV1LrSync *lr_sync) {
  652|  4.71k|  if (lr_sync != NULL) {
  ------------------
  |  Branch (652:7): [True: 4.71k, False: 0]
  ------------------
  653|  4.71k|    int j;
  654|  4.71k|#if CONFIG_MULTITHREAD
  655|  4.71k|    int i;
  656|  18.8k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  18.8k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (656:17): [True: 14.1k, False: 4.71k]
  ------------------
  657|  14.1k|      if (lr_sync->mutex_[j] != NULL) {
  ------------------
  |  Branch (657:11): [True: 3.48k, False: 10.6k]
  ------------------
  658|  14.2k|        for (i = 0; i < lr_sync->rows; ++i) {
  ------------------
  |  Branch (658:21): [True: 10.8k, False: 3.48k]
  ------------------
  659|  10.8k|          pthread_mutex_destroy(&lr_sync->mutex_[j][i]);
  660|  10.8k|        }
  661|  3.48k|        aom_free(lr_sync->mutex_[j]);
  662|  3.48k|      }
  663|  14.1k|      if (lr_sync->cond_[j] != NULL) {
  ------------------
  |  Branch (663:11): [True: 3.48k, False: 10.6k]
  ------------------
  664|  14.2k|        for (i = 0; i < lr_sync->rows; ++i) {
  ------------------
  |  Branch (664:21): [True: 10.8k, False: 3.48k]
  ------------------
  665|  10.8k|          pthread_cond_destroy(&lr_sync->cond_[j][i]);
  666|  10.8k|        }
  667|  3.48k|        aom_free(lr_sync->cond_[j]);
  668|  3.48k|      }
  669|  14.1k|    }
  670|  4.71k|    if (lr_sync->job_mutex != NULL) {
  ------------------
  |  Branch (670:9): [True: 1.19k, False: 3.52k]
  ------------------
  671|  1.19k|      pthread_mutex_destroy(lr_sync->job_mutex);
  672|  1.19k|      aom_free(lr_sync->job_mutex);
  673|  1.19k|    }
  674|  4.71k|#endif  // CONFIG_MULTITHREAD
  675|  18.8k|    for (j = 0; j < MAX_MB_PLANE; j++) {
  ------------------
  |  |   36|  18.8k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (675:17): [True: 14.1k, False: 4.71k]
  ------------------
  676|  14.1k|      aom_free(lr_sync->cur_sb_col[j]);
  677|  14.1k|    }
  678|       |
  679|  4.71k|    aom_free(lr_sync->job_queue);
  680|       |
  681|  4.71k|    if (lr_sync->lrworkerdata) {
  ------------------
  |  Branch (681:9): [True: 1.19k, False: 3.52k]
  ------------------
  682|  36.1k|      for (int worker_idx = 0; worker_idx < lr_sync->num_workers - 1;
  ------------------
  |  Branch (682:32): [True: 34.9k, False: 1.19k]
  ------------------
  683|  34.9k|           worker_idx++) {
  684|  34.9k|        LRWorkerData *const workerdata_data =
  685|  34.9k|            lr_sync->lrworkerdata + worker_idx;
  686|       |
  687|  34.9k|        aom_free(workerdata_data->rst_tmpbuf);
  688|  34.9k|        aom_free(workerdata_data->rlbs);
  689|  34.9k|      }
  690|  1.19k|      aom_free(lr_sync->lrworkerdata);
  691|  1.19k|    }
  692|       |
  693|       |    // clear the structure as the source of this call may be a resize in which
  694|       |    // case this call will be followed by an _alloc() which may fail.
  695|  4.71k|    av1_zero(*lr_sync);
  ------------------
  |  |   43|  4.71k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  696|  4.71k|  }
  697|  4.71k|}
av1_loop_restoration_filter_frame_mt:
  984|  15.7k|                                          int do_extend_border) {
  985|  15.7k|  assert(!cm->features.all_lossless);
  986|       |
  987|  15.7k|  const int num_planes = av1_num_planes(cm);
  988|       |
  989|  15.7k|  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
  990|       |
  991|  15.7k|  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
  992|  15.7k|                                         optimized_lr, num_planes);
  993|       |
  994|  15.7k|  foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync,
  995|  15.7k|                                 cm, do_extend_border);
  996|  15.7k|}
av1_cdef_init_fb_row_mt:
 1165|  69.3k|                             struct AV1CdefSyncData *const cdef_sync, int fbr) {
 1166|  69.3k|  const int num_planes = av1_num_planes(cm);
 1167|  69.3k|  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  69.3k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  69.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|  69.3k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  69.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1168|  69.3k|  const int luma_stride =
 1169|  69.3k|      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
  ------------------
  |  |   69|  69.3k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 1170|       |
 1171|       |  // for the current filter block, it's top left corner mi structure (mi_tl)
 1172|       |  // is first accessed to check whether the top and left boundaries are
 1173|       |  // frame boundaries. Then bottom-left and top-right mi structures are
 1174|       |  // accessed to check whether the bottom and right boundaries
 1175|       |  // (respectively) are frame boundaries.
 1176|       |  //
 1177|       |  // Note that we can't just check the bottom-right mi structure - eg. if
 1178|       |  // we're at the right-hand edge of the frame but not the bottom, then
 1179|       |  // the bottom-right mi is NULL but the bottom-left is not.
 1180|  69.3k|  fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
  ------------------
  |  |   58|  69.3k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  69.3k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (1180:34): [True: 19.2k, False: 50.0k]
  ------------------
 1181|  69.3k|  if (fbr != nvfb - 1)
  ------------------
  |  Branch (1181:7): [True: 49.9k, False: 19.4k]
  ------------------
 1182|  49.9k|    fb_info->frame_boundary[BOTTOM] =
 1183|  49.9k|        (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
  ------------------
  |  |   58|  49.9k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  49.9k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  |  Branch (1183:9): [True: 0, False: 49.9k]
  ------------------
 1184|  19.4k|  else
 1185|  19.4k|    fb_info->frame_boundary[BOTTOM] = 1;
 1186|       |
 1187|  69.3k|  fb_info->src = src;
 1188|  69.3k|  fb_info->damping = cm->cdef_info.cdef_damping;
 1189|  69.3k|  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
  ------------------
  |  |   35|  69.3k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 32.2k, False: 37.1k]
  |  |  ------------------
  ------------------
 1190|  69.3k|  av1_zero(fb_info->dir);
  ------------------
  |  |   43|  69.3k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1191|  69.3k|  av1_zero(fb_info->var);
  ------------------
  |  |   43|  69.3k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 1192|       |
 1193|   266k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (1193:23): [True: 197k, False: 69.3k]
  ------------------
 1194|   197k|    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
 1195|   197k|    uint16_t *top_linebuf = &linebuf[plane][0];
 1196|   197k|    uint16_t *bot_linebuf = &linebuf[plane][nvfb * CDEF_VBORDER * stride];
  ------------------
  |  |   23|   197k|#define CDEF_VBORDER (2)
  ------------------
 1197|   197k|    {
 1198|   197k|      const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
  ------------------
  |  |   39|   197k|#define MI_SIZE_LOG2 2
  ------------------
 1199|   197k|      const int top_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
  ------------------
  |  |   58|   197k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   197k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1200|   197k|      const int bot_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
  ------------------
  |  |   58|   197k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   197k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1201|       |
 1202|   197k|      if (fbr != nvfb - 1)  // if (fbr != 0)  // top line buffer copy
  ------------------
  |  Branch (1202:11): [True: 144k, False: 52.9k]
  ------------------
 1203|   144k|        av1_cdef_copy_sb8_16(
 1204|   144k|            cm, &top_linebuf[(fbr + 1) * CDEF_VBORDER * stride], stride,
  ------------------
  |  |   23|   144k|#define CDEF_VBORDER (2)
  ------------------
 1205|   144k|            xd->plane[plane].dst.buf, top_offset - CDEF_VBORDER, 0,
  ------------------
  |  |   23|   144k|#define CDEF_VBORDER (2)
  ------------------
 1206|   144k|            xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
  ------------------
  |  |   23|   144k|#define CDEF_VBORDER (2)
  ------------------
 1207|   197k|      if (fbr != nvfb - 1)  // bottom line buffer copy
  ------------------
  |  Branch (1207:11): [True: 143k, False: 54.4k]
  ------------------
 1208|   143k|        av1_cdef_copy_sb8_16(cm, &bot_linebuf[fbr * CDEF_VBORDER * stride],
  ------------------
  |  |   23|   143k|#define CDEF_VBORDER (2)
  ------------------
 1209|   143k|                             stride, xd->plane[plane].dst.buf, bot_offset, 0,
 1210|   143k|                             xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
  ------------------
  |  |   23|   143k|#define CDEF_VBORDER (2)
  ------------------
 1211|   197k|    }
 1212|       |
 1213|   197k|    fb_info->top_linebuf[plane] = &linebuf[plane][fbr * CDEF_VBORDER * stride];
  ------------------
  |  |   23|   197k|#define CDEF_VBORDER (2)
  ------------------
 1214|   197k|    fb_info->bot_linebuf[plane] =
 1215|   197k|        &linebuf[plane]
 1216|   197k|                [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)];
  ------------------
  |  |   23|   197k|#define CDEF_VBORDER (2)
  ------------------
                              [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)];
  ------------------
  |  |   23|   197k|#define CDEF_VBORDER (2)
  ------------------
 1217|   197k|  }
 1218|       |
 1219|  69.3k|  cdef_row_mt_sync_write(cdef_sync, fbr);
 1220|  69.3k|  cdef_row_mt_sync_read(cdef_sync, fbr);
 1221|  69.3k|}
av1_cdef_frame_mt:
 1235|  19.3k|                       int do_extend_border) {
 1236|  19.3k|  YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf;
 1237|  19.3k|  const int num_planes = av1_num_planes(cm);
 1238|       |
 1239|  19.3k|  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
 1240|  19.3k|                       num_planes);
 1241|       |
 1242|  19.3k|  reset_cdef_job_info(cdef_sync);
 1243|  19.3k|  prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook,
 1244|  19.3k|                             workers, cdef_sync, num_workers,
 1245|  19.3k|                             cdef_init_fb_row_fn, do_extend_border);
 1246|  19.3k|  launch_cdef_workers(workers, num_workers);
 1247|  19.3k|  sync_cdef_workers(workers, cm, num_workers);
 1248|  19.3k|}
av1_get_intrabc_extra_top_right_sb_delay:
 1250|  91.5k|int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm) {
 1251|       |  // No additional top-right delay when intraBC tool is not enabled.
 1252|  91.5k|  if (!av1_allow_intrabc(cm)) return 0;
  ------------------
  |  Branch (1252:7): [True: 84.4k, False: 7.06k]
  ------------------
 1253|       |  // Due to the hardware constraints on processing the intraBC tool with row
 1254|       |  // multithreading, a top-right delay of 3 superblocks of size 128x128 or 5
 1255|       |  // superblocks of size 64x64 is mandated. However, a minimum top-right delay
 1256|       |  // of 1 superblock is assured with 'sync_range'. Hence return only the
 1257|       |  // additional superblock delay when the intraBC tool is enabled.
 1258|  7.06k|  return cm->seq_params->sb_size == BLOCK_128X128 ? 2 : 4;
  ------------------
  |  Branch (1258:10): [True: 5.67k, False: 1.38k]
  ------------------
 1259|  91.5k|}
thread_common.c:get_sync_range:
   32|  3.41k|static inline int get_sync_range(int width) {
   33|       |  // nsync numbers are picked by testing. For example, for 4k
   34|       |  // video, using 4 gives best performance.
   35|  3.41k|  if (width < 640)
  ------------------
  |  Branch (35:7): [True: 3.17k, False: 234]
  ------------------
   36|  3.17k|    return 1;
   37|    234|  else if (width <= 1280)
  ------------------
  |  Branch (37:12): [True: 25, False: 209]
  ------------------
   38|     25|    return 2;
   39|    209|  else if (width <= 4096)
  ------------------
  |  Branch (39:12): [True: 159, False: 50]
  ------------------
   40|    159|    return 4;
   41|     50|  else
   42|     50|    return 8;
   43|  3.41k|}
thread_common.c:sync_write:
  228|   551k|                              const int sb_cols, int plane) {
  229|   551k|#if CONFIG_MULTITHREAD
  230|   551k|  const int nsync = lf_sync->sync_range;
  231|   551k|  int cur;
  232|       |  // Only signal when there are enough filtered SB for next row to run.
  233|   551k|  int sig = 1;
  234|       |
  235|   551k|  if (c < sb_cols - 1) {
  ------------------
  |  Branch (235:7): [True: 410k, False: 140k]
  ------------------
  236|   410k|    cur = c;
  237|   410k|    if (c % nsync) sig = 0;
  ------------------
  |  Branch (237:9): [True: 219k, False: 191k]
  ------------------
  238|   410k|  } else {
  239|   140k|    cur = sb_cols + nsync;
  240|   140k|  }
  241|       |
  242|   551k|  if (sig) {
  ------------------
  |  Branch (242:7): [True: 334k, False: 216k]
  ------------------
  243|   334k|    pthread_mutex_lock(&lf_sync->mutex_[plane][r]);
  244|       |
  245|       |    // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum
  246|       |    // column number. In this case, the AOMMAX operation here ensures that
  247|       |    // cur_sb_col[plane][r] is not overwritten with a smaller value thus
  248|       |    // preventing the infinite waiting of threads in the relevant sync_read()
  249|       |    // function.
  250|   334k|    lf_sync->cur_sb_col[plane][r] = AOMMAX(lf_sync->cur_sb_col[plane][r], cur);
  ------------------
  |  |   35|   334k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 334k]
  |  |  ------------------
  ------------------
  251|       |
  252|   334k|    pthread_cond_broadcast(&lf_sync->cond_[plane][r]);
  253|   334k|    pthread_mutex_unlock(&lf_sync->mutex_[plane][r]);
  254|   334k|  }
  255|       |#else
  256|       |  (void)lf_sync;
  257|       |  (void)r;
  258|       |  (void)c;
  259|       |  (void)sb_cols;
  260|       |  (void)plane;
  261|       |#endif  // CONFIG_MULTITHREAD
  262|   551k|}
thread_common.c:sync_read:
  206|  1.11M|                             int plane) {
  207|  1.11M|#if CONFIG_MULTITHREAD
  208|  1.11M|  const int nsync = lf_sync->sync_range;
  209|       |
  210|  1.11M|  if (r && !(c & (nsync - 1))) {
  ------------------
  |  Branch (210:7): [True: 982k, False: 132k]
  |  Branch (210:12): [True: 557k, False: 424k]
  ------------------
  211|   557k|    pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1];
  212|   557k|    pthread_mutex_lock(mutex);
  213|       |
  214|   677k|    while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) {
  ------------------
  |  Branch (214:12): [True: 120k, False: 557k]
  ------------------
  215|   120k|      pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex);
  216|   120k|    }
  217|   557k|    pthread_mutex_unlock(mutex);
  218|   557k|  }
  219|       |#else
  220|       |  (void)lf_sync;
  221|       |  (void)r;
  222|       |  (void)c;
  223|       |  (void)plane;
  224|       |#endif  // CONFIG_MULTITHREAD
  225|  1.11M|}
thread_common.c:loop_filter_rows_mt:
  435|  22.7k|                                AV1LfSync *lf_sync, int lpf_opt_level) {
  436|  22.7k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  437|  22.7k|  int i;
  438|  22.7k|  loop_filter_frame_mt_init(cm, start, stop, planes_to_lf, num_workers, lf_sync,
  439|  22.7k|                            lpf_opt_level, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   43|  22.7k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  22.7k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  22.7k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  440|       |
  441|       |  // Set up loopfilter thread data.
  442|   961k|  for (i = num_workers - 1; i >= 0; --i) {
  ------------------
  |  Branch (442:29): [True: 938k, False: 22.7k]
  ------------------
  443|   938k|    AVxWorker *const worker = &workers[i];
  444|   938k|    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
  445|       |
  446|   938k|    worker->hook = loop_filter_row_worker;
  447|   938k|    worker->data1 = lf_sync;
  448|   938k|    worker->data2 = lf_data;
  449|       |
  450|       |    // Loopfilter data
  451|   938k|    loop_filter_data_reset(lf_data, frame, cm, xd);
  452|       |
  453|       |    // Start loopfiltering
  454|   938k|    worker->had_error = 0;
  455|   938k|    if (i == 0) {
  ------------------
  |  Branch (455:9): [True: 22.7k, False: 915k]
  ------------------
  456|  22.7k|      winterface->execute(worker);
  457|   915k|    } else {
  458|   915k|      winterface->launch(worker);
  459|   915k|    }
  460|   938k|  }
  461|       |
  462|  22.7k|  sync_lf_workers(workers, cm, num_workers);
  463|  22.7k|}
thread_common.c:loop_filter_row_worker:
  393|   936k|static int loop_filter_row_worker(void *arg1, void *arg2) {
  394|   936k|  AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
  395|   936k|  LFWorkerData *const lf_data = (LFWorkerData *)arg2;
  396|   936k|  AV1LfMTInfo *cur_job_info;
  397|       |
  398|   936k|#if CONFIG_MULTITHREAD
  399|   936k|  pthread_mutex_t *job_mutex_ = lf_sync->job_mutex;
  400|   936k|#endif
  401|       |
  402|   936k|  struct aom_internal_error_info *const error_info = &lf_data->error_info;
  403|       |
  404|       |  // The jmp_buf is valid only for the duration of the function that calls
  405|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  406|       |  // before it returns.
  407|   936k|  if (setjmp(error_info->jmp)) {
  408|      0|    error_info->setjmp = 0;
  409|      0|#if CONFIG_MULTITHREAD
  410|      0|    pthread_mutex_lock(job_mutex_);
  411|      0|    lf_sync->lf_mt_exit = true;
  412|      0|    pthread_mutex_unlock(job_mutex_);
  413|      0|#endif
  414|      0|    av1_set_vert_loop_filter_done(lf_data->cm, lf_sync, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   43|      0|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|      0|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  415|      0|    return 0;
  416|      0|  }
  417|   936k|  error_info->setjmp = 1;
  418|       |
  419|  1.22M|  while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
  ------------------
  |  Branch (419:10): [True: 283k, False: 936k]
  ------------------
  420|   283k|    const int lpf_opt_level = cur_job_info->lpf_opt_level;
  421|   283k|    av1_thread_loop_filter_rows(
  422|   283k|        lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
  423|   283k|        cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
  424|   283k|        lpf_opt_level, lf_sync, error_info, lf_data->params_buf,
  425|   283k|        lf_data->tx_buf, MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   43|   283k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   283k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   283k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  426|   283k|  }
  427|   936k|  error_info->setjmp = 0;
  428|   936k|  return 1;
  429|   936k|}
thread_common.c:sync_lf_workers:
  370|  22.7k|                                   AV1_COMMON *const cm, int num_workers) {
  371|  22.7k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  372|  22.7k|  int had_error = workers[0].had_error;
  373|  22.7k|  struct aom_internal_error_info error_info;
  374|       |
  375|       |  // Read the error_info of main thread.
  376|  22.7k|  if (had_error) {
  ------------------
  |  Branch (376:7): [True: 0, False: 22.7k]
  ------------------
  377|      0|    AVxWorker *const worker = &workers[0];
  378|      0|    error_info = ((LFWorkerData *)worker->data2)->error_info;
  379|      0|  }
  380|       |
  381|       |  // Wait till all rows are finished.
  382|   938k|  for (int i = num_workers - 1; i > 0; --i) {
  ------------------
  |  Branch (382:33): [True: 915k, False: 22.7k]
  ------------------
  383|   915k|    AVxWorker *const worker = &workers[i];
  384|   915k|    if (!winterface->sync(worker)) {
  ------------------
  |  Branch (384:9): [True: 0, False: 915k]
  ------------------
  385|      0|      had_error = 1;
  386|      0|      error_info = ((LFWorkerData *)worker->data2)->error_info;
  387|      0|    }
  388|   915k|  }
  389|  22.7k|  if (had_error) aom_internal_error_copy(cm->error, &error_info);
  ------------------
  |  Branch (389:7): [True: 0, False: 22.7k]
  ------------------
  390|  22.7k|}
thread_common.c:loop_filter_rows:
  468|  7.29k|                             int lpf_opt_level) {
  469|       |  // Filter top rows of all planes first, in case the output can be partially
  470|       |  // reconstructed row by row.
  471|  7.29k|  int mi_row, plane, dir;
  472|       |
  473|  7.29k|  AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
  474|  7.29k|  TX_SIZE tx_buf[MAX_MIB_SIZE];
  475|  20.1k|  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
  ------------------
  |  |   44|  12.8k|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   43|  12.8k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  12.8k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   39|  12.8k|#define MI_SIZE_LOG2 2
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (475:24): [True: 12.8k, False: 7.29k]
  ------------------
  476|  51.3k|    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
  ------------------
  |  |   36|  51.3k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (476:21): [True: 38.5k, False: 12.8k]
  ------------------
  477|  38.5k|      if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
  ------------------
  |  Branch (477:11): [True: 7.10k, False: 31.4k]
  ------------------
  478|  7.10k|        continue;
  479|  7.10k|      }
  480|       |
  481|  94.3k|      for (dir = 0; dir < 2; ++dir) {
  ------------------
  |  Branch (481:21): [True: 62.8k, False: 31.4k]
  ------------------
  482|  62.8k|        av1_thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane,
  483|  62.8k|                                    dir, lpf_opt_level, /*lf_sync=*/NULL,
  484|  62.8k|                                    xd->error_info, params_buf, tx_buf,
  485|  62.8k|                                    MAX_MIB_SIZE_LOG2);
  ------------------
  |  |   43|  62.8k|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  62.8k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  62.8k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  486|  62.8k|      }
  487|  31.4k|    }
  488|  12.8k|  }
  489|  7.29k|}
thread_common.c:get_lr_sync_range:
   46|  1.19k|static inline int get_lr_sync_range(int width) {
   47|       |#if 0
   48|       |  // nsync numbers are picked by testing. For example, for 4k
   49|       |  // video, using 4 gives best performance.
   50|       |  if (width < 640)
   51|       |    return 1;
   52|       |  else if (width <= 1280)
   53|       |    return 2;
   54|       |  else if (width <= 4096)
   55|       |    return 4;
   56|       |  else
   57|       |    return 8;
   58|       |#else
   59|  1.19k|  (void)width;
   60|  1.19k|  return 1;
   61|  1.19k|#endif
   62|  1.19k|}
thread_common.c:foreach_rest_unit_in_planes_mt:
  923|  15.7k|                                           int do_extend_border) {
  924|  15.7k|  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
  925|       |
  926|  15.7k|  const int num_planes = av1_num_planes(cm);
  927|       |
  928|  15.7k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  929|  15.7k|  int num_rows_lr = 0;
  930|       |
  931|  60.9k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (931:23): [True: 45.2k, False: 15.7k]
  ------------------
  932|  45.2k|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (932:9): [True: 9.08k, False: 36.1k]
  ------------------
  933|       |
  934|  36.1k|    const int plane_h = ctxt[plane].plane_h;
  935|  36.1k|    const int unit_size = cm->rst_info[plane].restoration_unit_size;
  936|       |
  937|  36.1k|    num_rows_lr = AOMMAX(num_rows_lr, av1_lr_count_units(unit_size, plane_h));
  ------------------
  |  |   35|  36.1k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 2.26k, False: 33.8k]
  |  |  ------------------
  ------------------
  938|  36.1k|  }
  939|       |
  940|  15.7k|  int i;
  941|  15.7k|  assert(MAX_MB_PLANE == 3);
  942|       |
  943|  15.7k|  if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
  ------------------
  |  Branch (943:7): [True: 1.18k, False: 14.5k]
  |  Branch (943:31): [True: 9, False: 14.5k]
  ------------------
  944|  15.7k|      num_workers > lr_sync->num_workers || num_planes > lr_sync->num_planes) {
  ------------------
  |  Branch (944:7): [True: 0, False: 14.5k]
  |  Branch (944:45): [True: 1, False: 14.5k]
  ------------------
  945|  1.19k|    av1_loop_restoration_dealloc(lr_sync);
  946|  1.19k|    av1_loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr,
  947|  1.19k|                               num_planes, cm->width);
  948|  1.19k|  }
  949|  15.7k|  lr_sync->lr_mt_exit = false;
  950|       |
  951|       |  // Initialize cur_sb_col to -1 for all SB rows.
  952|  60.9k|  for (i = 0; i < num_planes; i++) {
  ------------------
  |  Branch (952:15): [True: 45.2k, False: 15.7k]
  ------------------
  953|  45.2k|    memset(lr_sync->cur_sb_col[i], -1,
  954|  45.2k|           sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr);
  955|  45.2k|  }
  956|       |
  957|  15.7k|  enqueue_lr_jobs(lr_sync, lr_ctxt, cm);
  958|       |
  959|       |  // Set up looprestoration thread data.
  960|   642k|  for (i = num_workers - 1; i >= 0; --i) {
  ------------------
  |  Branch (960:29): [True: 626k, False: 15.7k]
  ------------------
  961|   626k|    AVxWorker *const worker = &workers[i];
  962|   626k|    lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
  963|   626k|    lr_sync->lrworkerdata[i].do_extend_border = do_extend_border;
  964|   626k|    worker->hook = loop_restoration_row_worker;
  965|   626k|    worker->data1 = lr_sync;
  966|   626k|    worker->data2 = &lr_sync->lrworkerdata[i];
  967|       |
  968|       |    // Start loop restoration
  969|   626k|    worker->had_error = 0;
  970|   626k|    if (i == 0) {
  ------------------
  |  Branch (970:9): [True: 15.7k, False: 611k]
  ------------------
  971|  15.7k|      winterface->execute(worker);
  972|   611k|    } else {
  973|   611k|      winterface->launch(worker);
  974|   611k|    }
  975|   626k|  }
  976|       |
  977|  15.7k|  sync_lr_workers(workers, cm, num_workers);
  978|  15.7k|}
thread_common.c:enqueue_lr_jobs:
  700|  15.7k|                            AV1_COMMON *cm) {
  701|  15.7k|  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
  702|       |
  703|  15.7k|  const int num_planes = av1_num_planes(cm);
  704|  15.7k|  AV1LrMTInfo *lr_job_queue = lr_sync->job_queue;
  705|  15.7k|  int32_t lr_job_counter[2], num_even_lr_jobs = 0;
  706|  15.7k|  lr_sync->jobs_enqueued = 0;
  707|  15.7k|  lr_sync->jobs_dequeued = 0;
  708|       |
  709|  60.9k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (709:23): [True: 45.2k, False: 15.7k]
  ------------------
  710|  45.2k|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (710:9): [True: 9.08k, False: 36.1k]
  ------------------
  711|  36.1k|    num_even_lr_jobs =
  712|  36.1k|        num_even_lr_jobs + ((ctxt[plane].rsi->vert_units + 1) >> 1);
  713|  36.1k|  }
  714|  15.7k|  lr_job_counter[0] = 0;
  715|  15.7k|  lr_job_counter[1] = num_even_lr_jobs;
  716|       |
  717|  60.9k|  for (int plane = 0; plane < num_planes; plane++) {
  ------------------
  |  Branch (717:23): [True: 45.2k, False: 15.7k]
  ------------------
  718|  45.2k|    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (718:9): [True: 9.08k, False: 36.1k]
  ------------------
  719|  36.1k|    const int is_uv = plane > 0;
  720|  36.1k|    const int ss_y = is_uv && cm->seq_params->subsampling_y;
  ------------------
  |  Branch (720:22): [True: 21.1k, False: 14.9k]
  |  Branch (720:31): [True: 15.9k, False: 5.25k]
  ------------------
  721|  36.1k|    const int unit_size = ctxt[plane].rsi->restoration_unit_size;
  722|  36.1k|    const int plane_h = ctxt[plane].plane_h;
  723|  36.1k|    const int ext_size = unit_size * 3 / 2;
  724|       |
  725|  36.1k|    int y0 = 0, i = 0;
  726|  88.9k|    while (y0 < plane_h) {
  ------------------
  |  Branch (726:12): [True: 52.7k, False: 36.1k]
  ------------------
  727|  52.7k|      int remaining_h = plane_h - y0;
  728|  52.7k|      int h = (remaining_h < ext_size) ? remaining_h : unit_size;
  ------------------
  |  Branch (728:15): [True: 36.1k, False: 16.6k]
  ------------------
  729|       |
  730|  52.7k|      RestorationTileLimits limits;
  731|  52.7k|      limits.v_start = y0;
  732|  52.7k|      limits.v_end = y0 + h;
  733|  52.7k|      assert(limits.v_end <= plane_h);
  734|       |      // Offset upwards to align with the restoration processing stripe
  735|  52.7k|      const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
  ------------------
  |  |   37|  52.7k|#define RESTORATION_UNIT_OFFSET 8
  ------------------
  736|  52.7k|      limits.v_start = AOMMAX(0, limits.v_start - voffset);
  ------------------
  |  |   35|  52.7k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 36.1k, False: 16.6k]
  |  |  ------------------
  ------------------
  737|  52.7k|      if (limits.v_end < plane_h) limits.v_end -= voffset;
  ------------------
  |  Branch (737:11): [True: 16.6k, False: 36.1k]
  ------------------
  738|       |
  739|  52.7k|      assert(lr_job_counter[0] <= num_even_lr_jobs);
  740|       |
  741|  52.7k|      lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i;
  742|  52.7k|      lr_job_queue[lr_job_counter[i & 1]].plane = plane;
  743|  52.7k|      lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start;
  744|  52.7k|      lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end;
  745|  52.7k|      lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1;
  746|  52.7k|      if ((i & 1) == 0) {
  ------------------
  |  Branch (746:11): [True: 37.7k, False: 15.0k]
  ------------------
  747|  37.7k|        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
  748|  37.7k|            limits.v_start + RESTORATION_BORDER;
  ------------------
  |  |   62|  37.7k|#define RESTORATION_BORDER 3
  ------------------
  749|  37.7k|        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
  750|  37.7k|            limits.v_end - RESTORATION_BORDER;
  ------------------
  |  |   62|  37.7k|#define RESTORATION_BORDER 3
  ------------------
  751|  37.7k|        if (i == 0) {
  ------------------
  |  Branch (751:13): [True: 36.1k, False: 1.62k]
  ------------------
  752|  36.1k|          assert(limits.v_start == 0);
  753|  36.1k|          lr_job_queue[lr_job_counter[i & 1]].v_copy_start = 0;
  754|  36.1k|        }
  755|  37.7k|        if (i == (ctxt[plane].rsi->vert_units - 1)) {
  ------------------
  |  Branch (755:13): [True: 22.7k, False: 15.0k]
  ------------------
  756|  22.7k|          assert(limits.v_end == plane_h);
  757|  22.7k|          lr_job_queue[lr_job_counter[i & 1]].v_copy_end = plane_h;
  758|  22.7k|        }
  759|  37.7k|      } else {
  760|  15.0k|        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
  761|  15.0k|            AOMMAX(limits.v_start - RESTORATION_BORDER, 0);
  ------------------
  |  |   35|  15.0k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 15.0k, False: 0]
  |  |  ------------------
  ------------------
  762|  15.0k|        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
  763|  15.0k|            AOMMIN(limits.v_end + RESTORATION_BORDER, plane_h);
  ------------------
  |  |   34|  15.0k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.62k, False: 13.3k]
  |  |  ------------------
  ------------------
  764|  15.0k|      }
  765|  52.7k|      lr_job_counter[i & 1]++;
  766|  52.7k|      lr_sync->jobs_enqueued++;
  767|       |
  768|  52.7k|      y0 += h;
  769|  52.7k|      ++i;
  770|  52.7k|    }
  771|  36.1k|  }
  772|  15.7k|}
thread_common.c:loop_restoration_row_worker:
  814|   625k|static int loop_restoration_row_worker(void *arg1, void *arg2) {
  815|   625k|  AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
  816|   625k|  LRWorkerData *lrworkerdata = (LRWorkerData *)arg2;
  817|   625k|  AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
  818|   625k|  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
  819|   625k|  int lr_unit_row;
  820|   625k|  int plane;
  821|   625k|  int plane_w;
  822|   625k|#if CONFIG_MULTITHREAD
  823|   625k|  pthread_mutex_t *job_mutex_ = lr_sync->job_mutex;
  824|   625k|#endif
  825|   625k|  struct aom_internal_error_info *const error_info = &lrworkerdata->error_info;
  826|       |
  827|       |  // The jmp_buf is valid only for the duration of the function that calls
  828|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  829|       |  // before it returns.
  830|   625k|  if (setjmp(error_info->jmp)) {
  831|      0|    error_info->setjmp = 0;
  832|      0|#if CONFIG_MULTITHREAD
  833|      0|    pthread_mutex_lock(job_mutex_);
  834|      0|    lr_sync->lr_mt_exit = true;
  835|      0|    pthread_mutex_unlock(job_mutex_);
  836|      0|#endif
  837|       |    // In case of loop restoration multithreading, the worker on an even lr
  838|       |    // block row waits for the completion of the filtering of the top-right and
  839|       |    // bottom-right blocks. Hence, in case a thread (main/worker) encounters an
  840|       |    // error, update that filtering of every row in the frame is complete in
  841|       |    // order to avoid the dependent workers from waiting indefinitely.
  842|      0|    set_loop_restoration_done(lr_sync, lr_ctxt->ctxt);
  843|      0|    return 0;
  844|      0|  }
  845|   625k|  error_info->setjmp = 1;
  846|       |
  847|   625k|  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
  848|   625k|                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
  849|   625k|                           int vstart, int vend);
  850|   625k|  static const copy_fun copy_funs[MAX_MB_PLANE] = {
  851|   625k|    aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u,
  ------------------
  |  |   58|   625k|#define aom_yv12_partial_coloc_copy_y aom_yv12_partial_coloc_copy_y_c
  ------------------
                  aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u,
  ------------------
  |  |   52|   625k|#define aom_yv12_partial_coloc_copy_u aom_yv12_partial_coloc_copy_u_c
  ------------------
  852|   625k|    aom_yv12_partial_coloc_copy_v
  ------------------
  |  |   55|   625k|#define aom_yv12_partial_coloc_copy_v aom_yv12_partial_coloc_copy_v_c
  ------------------
  853|   625k|  };
  854|       |
  855|   678k|  while (1) {
  ------------------
  |  Branch (855:10): [Folded - Ignored]
  ------------------
  856|   676k|    AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
  857|   676k|    if (cur_job_info != NULL) {
  ------------------
  |  Branch (857:9): [True: 52.7k, False: 624k]
  ------------------
  858|  52.7k|      RestorationTileLimits limits;
  859|  52.7k|      sync_read_fn_t on_sync_read;
  860|  52.7k|      sync_write_fn_t on_sync_write;
  861|  52.7k|      limits.v_start = cur_job_info->v_start;
  862|  52.7k|      limits.v_end = cur_job_info->v_end;
  863|  52.7k|      lr_unit_row = cur_job_info->lr_unit_row;
  864|  52.7k|      plane = cur_job_info->plane;
  865|  52.7k|      plane_w = ctxt[plane].plane_w;
  866|       |
  867|       |      // sync_mode == 1 implies only sync read is required in LR Multi-threading
  868|       |      // sync_mode == 0 implies only sync write is required.
  869|  52.7k|      on_sync_read =
  870|  52.7k|          cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy;
  ------------------
  |  Branch (870:11): [True: 15.0k, False: 37.7k]
  ------------------
  871|  52.7k|      on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write
  ------------------
  |  Branch (871:23): [True: 37.7k, False: 15.0k]
  ------------------
  872|  52.7k|                                                   : av1_lr_sync_write_dummy;
  873|       |
  874|  52.7k|      av1_foreach_rest_unit_in_row(
  875|  52.7k|          &limits, plane_w, lr_ctxt->on_rest_unit, lr_unit_row,
  876|  52.7k|          ctxt[plane].rsi->restoration_unit_size, ctxt[plane].rsi->horz_units,
  877|  52.7k|          ctxt[plane].rsi->vert_units, plane, &ctxt[plane],
  878|  52.7k|          lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
  879|  52.7k|          on_sync_write, lr_sync, error_info);
  880|       |
  881|  52.7k|      copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, 0, plane_w,
  882|  52.7k|                       cur_job_info->v_copy_start, cur_job_info->v_copy_end);
  883|       |
  884|  52.7k|      if (lrworkerdata->do_extend_border) {
  ------------------
  |  Branch (884:11): [True: 0, False: 52.7k]
  ------------------
  885|      0|        aom_extend_frame_borders_plane_row(lr_ctxt->frame, plane,
  ------------------
  |  |   34|      0|#define aom_extend_frame_borders_plane_row aom_extend_frame_borders_plane_row_c
  ------------------
  886|      0|                                           cur_job_info->v_copy_start,
  887|      0|                                           cur_job_info->v_copy_end);
  888|      0|      }
  889|   624k|    } else {
  890|   624k|      break;
  891|   624k|    }
  892|   676k|  }
  893|   625k|  error_info->setjmp = 0;
  894|   625k|  return 1;
  895|   625k|}
thread_common.c:get_lr_job_info:
  774|   677k|static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
  775|   677k|  AV1LrMTInfo *cur_job_info = NULL;
  776|       |
  777|   677k|#if CONFIG_MULTITHREAD
  778|   677k|  pthread_mutex_lock(lr_sync->job_mutex);
  779|       |
  780|   679k|  if (!lr_sync->lr_mt_exit && lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) {
  ------------------
  |  Branch (780:7): [True: 679k, False: 18.4E]
  |  Branch (780:31): [True: 52.7k, False: 626k]
  ------------------
  781|  52.7k|    cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued;
  782|  52.7k|    lr_sync->jobs_dequeued++;
  783|  52.7k|  }
  784|       |
  785|   677k|  pthread_mutex_unlock(lr_sync->job_mutex);
  786|       |#else
  787|       |  (void)lr_sync;
  788|       |#endif
  789|       |
  790|   677k|  return cur_job_info;
  791|   677k|}
thread_common.c:lr_sync_read:
  525|  47.8k|static inline void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
  526|  47.8k|#if CONFIG_MULTITHREAD
  527|  47.8k|  AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
  528|  47.8k|  const int nsync = loop_res_sync->sync_range;
  529|       |
  530|  47.8k|  if (r && !(c & (nsync - 1))) {
  ------------------
  |  Branch (530:7): [True: 47.8k, False: 5]
  |  Branch (530:12): [True: 47.8k, False: 18.4E]
  ------------------
  531|  47.8k|    pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1];
  532|  47.8k|    pthread_mutex_lock(mutex);
  533|       |
  534|  66.2k|    while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) {
  ------------------
  |  Branch (534:12): [True: 18.3k, False: 47.8k]
  ------------------
  535|  18.3k|      pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex);
  536|  18.3k|    }
  537|  47.8k|    pthread_mutex_unlock(mutex);
  538|  47.8k|  }
  539|       |#else
  540|       |  (void)lr_sync;
  541|       |  (void)r;
  542|       |  (void)c;
  543|       |  (void)plane;
  544|       |#endif  // CONFIG_MULTITHREAD
  545|  47.8k|}
thread_common.c:lr_sync_write:
  548|  67.2k|                                 const int sb_cols, int plane) {
  549|  67.2k|#if CONFIG_MULTITHREAD
  550|  67.2k|  AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
  551|  67.2k|  const int nsync = loop_res_sync->sync_range;
  552|  67.2k|  int cur;
  553|       |  // Only signal when there are enough filtered SB for next row to run.
  554|  67.2k|  int sig = 1;
  555|       |
  556|  67.2k|  if (c < sb_cols - 1) {
  ------------------
  |  Branch (556:7): [True: 29.5k, False: 37.7k]
  ------------------
  557|  29.5k|    cur = c;
  558|  29.5k|    if (c % nsync) sig = 0;
  ------------------
  |  Branch (558:9): [True: 0, False: 29.5k]
  ------------------
  559|  37.7k|  } else {
  560|  37.7k|    cur = sb_cols + nsync;
  561|  37.7k|  }
  562|       |
  563|  67.2k|  if (sig) {
  ------------------
  |  Branch (563:7): [True: 67.2k, False: 4]
  ------------------
  564|  67.2k|    pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]);
  565|       |
  566|       |    // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum
  567|       |    // column number. In this case, the AOMMAX operation here ensures that
  568|       |    // cur_sb_col[plane][r] is not overwritten with a smaller value thus
  569|       |    // preventing the infinite waiting of threads in the relevant sync_read()
  570|       |    // function.
  571|  67.2k|    loop_res_sync->cur_sb_col[plane][r] =
  572|  67.2k|        AOMMAX(loop_res_sync->cur_sb_col[plane][r], cur);
  ------------------
  |  |   35|  67.2k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 67.2k]
  |  |  ------------------
  ------------------
  573|       |
  574|  67.2k|    pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]);
  575|  67.2k|    pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]);
  576|  67.2k|  }
  577|       |#else
  578|       |  (void)lr_sync;
  579|       |  (void)r;
  580|       |  (void)c;
  581|       |  (void)sb_cols;
  582|       |  (void)plane;
  583|       |#endif  // CONFIG_MULTITHREAD
  584|  67.2k|}
thread_common.c:sync_lr_workers:
  898|  15.7k|                                   AV1_COMMON *const cm, int num_workers) {
  899|  15.7k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  900|  15.7k|  int had_error = workers[0].had_error;
  901|  15.7k|  struct aom_internal_error_info error_info;
  902|       |
  903|       |  // Read the error_info of main thread.
  904|  15.7k|  if (had_error) {
  ------------------
  |  Branch (904:7): [True: 0, False: 15.7k]
  ------------------
  905|      0|    AVxWorker *const worker = &workers[0];
  906|      0|    error_info = ((LRWorkerData *)worker->data2)->error_info;
  907|      0|  }
  908|       |
  909|       |  // Wait till all rows are finished.
  910|   626k|  for (int i = num_workers - 1; i > 0; --i) {
  ------------------
  |  Branch (910:33): [True: 611k, False: 15.7k]
  ------------------
  911|   611k|    AVxWorker *const worker = &workers[i];
  912|   611k|    if (!winterface->sync(worker)) {
  ------------------
  |  Branch (912:9): [True: 0, False: 611k]
  ------------------
  913|      0|      had_error = 1;
  914|      0|      error_info = ((LRWorkerData *)worker->data2)->error_info;
  915|      0|    }
  916|   611k|  }
  917|  15.7k|  if (had_error) aom_internal_error_copy(cm->error, &error_info);
  ------------------
  |  Branch (917:7): [True: 0, False: 15.7k]
  ------------------
  918|  15.7k|}
thread_common.c:cdef_row_mt_sync_write:
  192|  68.5k|                                          int row) {
  193|  68.5k|#if CONFIG_MULTITHREAD
  194|  68.5k|  AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
  195|  68.5k|  pthread_mutex_lock(cdef_row_mt[row].row_mutex_);
  196|  68.5k|  pthread_cond_signal(cdef_row_mt[row].row_cond_);
  197|  68.5k|  cdef_row_mt[row].is_row_done = 1;
  198|  68.5k|  pthread_mutex_unlock(cdef_row_mt[row].row_mutex_);
  199|       |#else
  200|       |  (void)cdef_sync;
  201|       |  (void)row;
  202|       |#endif  // CONFIG_MULTITHREAD
  203|  68.5k|}
thread_common.c:cdef_row_mt_sync_read:
  176|  69.4k|                                         int row) {
  177|  69.4k|  if (!row) return;
  ------------------
  |  Branch (177:7): [True: 19.3k, False: 50.1k]
  ------------------
  178|  50.1k|#if CONFIG_MULTITHREAD
  179|  50.1k|  AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
  180|  50.1k|  pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_);
  181|  59.0k|  while (cdef_row_mt[row - 1].is_row_done != 1)
  ------------------
  |  Branch (181:10): [True: 8.93k, False: 50.1k]
  ------------------
  182|  8.93k|    pthread_cond_wait(cdef_row_mt[row - 1].row_cond_,
  183|  8.93k|                      cdef_row_mt[row - 1].row_mutex_);
  184|  50.1k|  cdef_row_mt[row - 1].is_row_done = 0;
  185|  50.1k|  pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_);
  186|       |#else
  187|       |  (void)cdef_sync;
  188|       |#endif  // CONFIG_MULTITHREAD
  189|  50.1k|}
thread_common.c:reset_cdef_job_info:
 1000|  19.3k|static inline void reset_cdef_job_info(AV1CdefSync *const cdef_sync) {
 1001|  19.3k|  cdef_sync->end_of_frame = 0;
 1002|  19.3k|  cdef_sync->fbr = 0;
 1003|  19.3k|  cdef_sync->fbc = 0;
 1004|  19.3k|  cdef_sync->cdef_mt_exit = false;
 1005|  19.3k|}
thread_common.c:prepare_cdef_frame_workers:
 1139|  19.3k|    int do_extend_border) {
 1140|  19.3k|  const int num_planes = av1_num_planes(cm);
 1141|       |
 1142|  19.3k|  cdef_worker[0].srcbuf = cm->cdef_info.srcbuf;
 1143|  75.2k|  for (int plane = 0; plane < num_planes; plane++)
  ------------------
  |  Branch (1143:23): [True: 55.9k, False: 19.3k]
  ------------------
 1144|  55.9k|    cdef_worker[0].colbuf[plane] = cm->cdef_info.colbuf[plane];
 1145|   776k|  for (int i = num_workers - 1; i >= 0; i--) {
  ------------------
  |  Branch (1145:33): [True: 756k, False: 19.3k]
  ------------------
 1146|   756k|    AVxWorker *const worker = &workers[i];
 1147|   756k|    cdef_worker[i].cm = cm;
 1148|   756k|    cdef_worker[i].xd = xd;
 1149|   756k|    cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn;
 1150|   756k|    cdef_worker[i].do_extend_border = do_extend_border;
 1151|  2.95M|    for (int plane = 0; plane < num_planes; plane++)
  ------------------
  |  Branch (1151:25): [True: 2.19M, False: 756k]
  ------------------
 1152|  2.19M|      cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane];
 1153|       |
 1154|   756k|    worker->hook = hook;
 1155|   756k|    worker->data1 = cdef_sync;
 1156|   756k|    worker->data2 = &cdef_worker[i];
 1157|   756k|  }
 1158|  19.3k|}
thread_common.c:cdef_sb_row_worker_hook:
 1079|   755k|static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
 1080|   755k|  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
 1081|   755k|  AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2;
 1082|   755k|  AV1_COMMON *cm = cdef_worker->cm;
 1083|   755k|  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|   755k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   755k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
                const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
  ------------------
  |  |   58|   755k|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   755k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1084|       |
 1085|   755k|#if CONFIG_MULTITHREAD
 1086|   755k|  pthread_mutex_t *job_mutex_ = cdef_sync->mutex_;
 1087|   755k|#endif
 1088|   755k|  struct aom_internal_error_info *const error_info = &cdef_worker->error_info;
 1089|       |
 1090|       |  // The jmp_buf is valid only for the duration of the function that calls
 1091|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
 1092|       |  // before it returns.
 1093|   755k|  if (setjmp(error_info->jmp)) {
 1094|      0|    error_info->setjmp = 0;
 1095|      0|#if CONFIG_MULTITHREAD
 1096|      0|    pthread_mutex_lock(job_mutex_);
 1097|      0|    cdef_sync->cdef_mt_exit = true;
 1098|      0|    pthread_mutex_unlock(job_mutex_);
 1099|      0|#endif
 1100|       |    // In case of cdef row-multithreading, the worker on a filter block row
 1101|       |    // (fbr) waits for the line buffers (top and bottom) copy of the above row.
 1102|       |    // Hence, in case a thread (main/worker) encounters an error before copying
 1103|       |    // of the line buffers, update that line buffer copy is complete in order to
 1104|       |    // avoid dependent workers waiting indefinitely.
 1105|      0|    set_cdef_init_fb_row_done(cdef_sync, nvfb);
 1106|      0|    return 0;
 1107|      0|  }
 1108|   755k|  error_info->setjmp = 1;
 1109|       |
 1110|   755k|  volatile int cur_fbr;
 1111|   755k|  const int num_planes = av1_num_planes(cm);
 1112|   825k|  while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) {
  ------------------
  |  Branch (1112:10): [True: 69.4k, False: 755k]
  ------------------
 1113|  69.4k|    MACROBLOCKD *xd = cdef_worker->xd;
 1114|  69.4k|    av1_cdef_fb_row(cm, xd, cdef_worker->linebuf, cdef_worker->colbuf,
 1115|  69.4k|                    cdef_worker->srcbuf, cur_fbr,
 1116|  69.4k|                    cdef_worker->cdef_init_fb_row_fn, cdef_sync, error_info);
 1117|  69.4k|    if (cdef_worker->do_extend_border) {
  ------------------
  |  Branch (1117:9): [True: 0, False: 69.4k]
  ------------------
 1118|      0|      for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1118:27): [True: 0, False: 0]
  ------------------
 1119|      0|        const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
 1120|      0|        const int is_uv = plane > 0;
 1121|      0|        const int mi_high = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
  ------------------
  |  |   39|      0|#define MI_SIZE_LOG2 2
  ------------------
 1122|      0|        const int unit_height = MI_SIZE_64X64 << mi_high;
  ------------------
  |  |   58|      0|#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|      0|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
 1123|      0|        const int v_start = cur_fbr * unit_height;
 1124|      0|        const int v_end =
 1125|      0|            AOMMIN(v_start + unit_height, ybf->crop_heights[is_uv]);
  ------------------
  |  |   34|      0|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 0]
  |  |  ------------------
  ------------------
 1126|      0|        aom_extend_frame_borders_plane_row(ybf, plane, v_start, v_end);
  ------------------
  |  |   34|      0|#define aom_extend_frame_borders_plane_row aom_extend_frame_borders_plane_row_c
  ------------------
 1127|      0|      }
 1128|      0|    }
 1129|  69.4k|  }
 1130|   755k|  error_info->setjmp = 0;
 1131|   755k|  return 1;
 1132|   755k|}
thread_common.c:get_cdef_row_next_job:
 1056|   820k|                                        volatile int *cur_fbr, const int nvfb) {
 1057|   820k|#if CONFIG_MULTITHREAD
 1058|   820k|  pthread_mutex_lock(cdef_sync->mutex_);
 1059|   820k|#endif  // CONFIG_MULTITHREAD
 1060|   820k|  int do_next_row = 0;
 1061|       |  // Populates information needed for current job and update the row
 1062|       |  // index of the next row to be processed.
 1063|   826k|  if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
  ------------------
  |  Branch (1063:7): [True: 826k, False: 18.4E]
  |  Branch (1063:35): [True: 69.4k, False: 756k]
  ------------------
 1064|  69.4k|    do_next_row = 1;
 1065|  69.4k|    *cur_fbr = cdef_sync->fbr;
 1066|  69.4k|    update_cdef_row_next_job_info(cdef_sync, nvfb);
 1067|  69.4k|  }
 1068|   820k|#if CONFIG_MULTITHREAD
 1069|   820k|  pthread_mutex_unlock(cdef_sync->mutex_);
 1070|   820k|#endif  // CONFIG_MULTITHREAD
 1071|   820k|  return do_next_row;
 1072|   820k|}
thread_common.c:update_cdef_row_next_job_info:
 1046|  69.4k|                                          const int nvfb) {
 1047|  69.4k|  cdef_sync->fbr++;
 1048|  69.4k|  if (cdef_sync->fbr == nvfb) {
  ------------------
  |  Branch (1048:7): [True: 19.3k, False: 50.1k]
  ------------------
 1049|  19.3k|    cdef_sync->end_of_frame = 1;
 1050|  19.3k|  }
 1051|  69.4k|}
thread_common.c:launch_cdef_workers:
 1008|  19.3k|                                       int num_workers) {
 1009|  19.3k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 1010|   776k|  for (int i = num_workers - 1; i >= 0; i--) {
  ------------------
  |  Branch (1010:33): [True: 756k, False: 19.3k]
  ------------------
 1011|   756k|    AVxWorker *const worker = &workers[i];
 1012|   756k|    worker->had_error = 0;
 1013|   756k|    if (i == 0)
  ------------------
  |  Branch (1013:9): [True: 19.3k, False: 737k]
  ------------------
 1014|  19.3k|      winterface->execute(worker);
 1015|   737k|    else
 1016|   737k|      winterface->launch(worker);
 1017|   756k|  }
 1018|  19.3k|}
thread_common.c:sync_cdef_workers:
 1021|  19.3k|                                     AV1_COMMON *const cm, int num_workers) {
 1022|  19.3k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 1023|  19.3k|  int had_error = workers[0].had_error;
 1024|  19.3k|  struct aom_internal_error_info error_info;
 1025|       |
 1026|       |  // Read the error_info of main thread.
 1027|  19.3k|  if (had_error) {
  ------------------
  |  Branch (1027:7): [True: 0, False: 19.3k]
  ------------------
 1028|      0|    AVxWorker *const worker = &workers[0];
 1029|      0|    error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
 1030|      0|  }
 1031|       |
 1032|       |  // Wait till all rows are finished.
 1033|   756k|  for (int i = num_workers - 1; i > 0; --i) {
  ------------------
  |  Branch (1033:33): [True: 737k, False: 19.3k]
  ------------------
 1034|   737k|    AVxWorker *const worker = &workers[i];
 1035|   737k|    if (!winterface->sync(worker)) {
  ------------------
  |  Branch (1035:9): [True: 0, False: 737k]
  ------------------
 1036|      0|      had_error = 1;
 1037|      0|      error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
 1038|      0|    }
 1039|   737k|  }
 1040|  19.3k|  if (had_error) aom_internal_error_copy(cm->error, &error_info);
  ------------------
  |  Branch (1040:7): [True: 0, False: 19.3k]
  ------------------
 1041|  19.3k|}

thread_common.c:check_planes_to_loop_filter:
  336|  30.0k|                                              int plane_start, int plane_end) {
  337|  30.0k|  set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end);
  338|       |  // If the luma plane is purposely not filtered, neither are the chroma
  339|       |  // planes.
  340|  30.0k|  if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return 0;
  ------------------
  |  Branch (340:7): [True: 0, False: 30.0k]
  |  Branch (340:27): [True: 0, False: 0]
  |  Branch (340:47): [True: 0, False: 0]
  ------------------
  341|       |  // Early exit.
  342|  30.0k|  if (!planes_to_lf[0] && !planes_to_lf[1] && !planes_to_lf[2]) return 0;
  ------------------
  |  Branch (342:7): [True: 0, False: 30.0k]
  |  Branch (342:27): [True: 0, False: 0]
  |  Branch (342:47): [True: 0, False: 0]
  ------------------
  343|  30.0k|  return 1;
  344|  30.0k|}
thread_common.c:set_planes_to_loop_filter:
  326|  30.0k|                                             int plane_start, int plane_end) {
  327|       |  // For each luma and chroma plane, whether to filter it or not.
  328|  30.0k|  planes_to_lf[0] = (lf->filter_level[0] || lf->filter_level[1]) &&
  ------------------
  |  Branch (328:22): [True: 23.6k, False: 6.32k]
  |  Branch (328:45): [True: 6.32k, False: 0]
  ------------------
  329|  30.0k|                    plane_start <= 0 && 0 < plane_end;
  ------------------
  |  Branch (329:21): [True: 30.0k, False: 0]
  |  Branch (329:41): [True: 30.0k, False: 0]
  ------------------
  330|  30.0k|  planes_to_lf[1] = lf->filter_level_u && plane_start <= 1 && 1 < plane_end;
  ------------------
  |  Branch (330:21): [True: 22.2k, False: 7.73k]
  |  Branch (330:43): [True: 22.2k, False: 0]
  |  Branch (330:63): [True: 22.2k, False: 7]
  ------------------
  331|  30.0k|  planes_to_lf[2] = lf->filter_level_v && plane_start <= 2 && 2 < plane_end;
  ------------------
  |  Branch (331:21): [True: 20.7k, False: 9.27k]
  |  Branch (331:43): [True: 20.7k, False: 0]
  |  Branch (331:63): [True: 20.7k, False: 7]
  ------------------
  332|  30.0k|}
thread_common.c:loop_filter_frame_mt_init:
  268|  22.7k|    int lpf_opt_level, int num_mis_in_lpf_unit_height_log2) {
  269|       |  // Number of superblock rows
  270|  22.7k|  const int sb_rows =
  271|  22.7k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2);
  ------------------
  |  |   62|  22.7k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  272|       |
  273|  22.7k|  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
  ------------------
  |  Branch (273:7): [True: 1.81k, False: 20.9k]
  |  Branch (273:31): [True: 1.59k, False: 19.3k]
  ------------------
  274|  22.7k|      num_workers > lf_sync->num_workers) {
  ------------------
  |  Branch (274:7): [True: 0, False: 19.3k]
  ------------------
  275|  3.41k|    av1_loop_filter_dealloc(lf_sync);
  276|  3.41k|    av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
  277|  3.41k|  }
  278|  22.7k|  lf_sync->lf_mt_exit = false;
  279|       |
  280|       |  // Initialize cur_sb_col to -1 for all SB rows.
  281|  90.9k|  for (int i = 0; i < MAX_MB_PLANE; i++) {
  ------------------
  |  |   36|  90.9k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (281:19): [True: 68.1k, False: 22.7k]
  ------------------
  282|  68.1k|    memset(lf_sync->cur_sb_col[i], -1,
  283|  68.1k|           sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
  284|  68.1k|  }
  285|       |
  286|  22.7k|  enqueue_lf_jobs(lf_sync, start_mi_row, end_mi_row, planes_to_lf,
  287|  22.7k|                  lpf_opt_level, (1 << num_mis_in_lpf_unit_height_log2));
  288|  22.7k|}
thread_common.c:enqueue_lf_jobs:
  238|  22.7k|                                   int num_mis_in_lpf_unit_height) {
  239|  22.7k|  int mi_row, plane, dir;
  240|  22.7k|  AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
  241|  22.7k|  lf_sync->jobs_enqueued = 0;
  242|  22.7k|  lf_sync->jobs_dequeued = 0;
  243|       |
  244|       |  // Launch all vertical jobs first, as they are blocking the horizontal ones.
  245|       |  // Launch top row jobs for all planes first, in case the output can be
  246|       |  // partially reconstructed row by row.
  247|  68.1k|  for (dir = 0; dir < 2; ++dir) {
  ------------------
  |  Branch (247:17): [True: 45.4k, False: 22.7k]
  ------------------
  248|   161k|    for (mi_row = start; mi_row < stop; mi_row += num_mis_in_lpf_unit_height) {
  ------------------
  |  Branch (248:26): [True: 116k, False: 45.4k]
  ------------------
  249|   464k|      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
  ------------------
  |  |   36|   464k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (249:23): [True: 348k, False: 116k]
  ------------------
  250|   348k|        if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
  ------------------
  |  Branch (250:13): [True: 64.9k, False: 283k]
  ------------------
  251|  64.9k|          continue;
  252|  64.9k|        }
  253|   283k|        if (!planes_to_lf[plane]) continue;
  ------------------
  |  Branch (253:13): [True: 0, False: 283k]
  ------------------
  254|   283k|        lf_job_queue->mi_row = mi_row;
  255|   283k|        lf_job_queue->plane = plane;
  256|   283k|        lf_job_queue->dir = dir;
  257|   283k|        lf_job_queue->lpf_opt_level = lpf_opt_level;
  258|   283k|        lf_job_queue++;
  259|   283k|        lf_sync->jobs_enqueued++;
  260|   283k|      }
  261|   116k|    }
  262|  45.4k|  }
  263|  22.7k|}
thread_common.c:skip_loop_filter_plane:
  213|   387k|    const int planes_to_lf[MAX_MB_PLANE], int plane, int lpf_opt_level) {
  214|       |  // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both
  215|       |  // chroma planes together
  216|   387k|  if (lpf_opt_level == 2) {
  ------------------
  |  Branch (216:7): [True: 0, False: 387k]
  ------------------
  217|      0|    if (plane == AOM_PLANE_Y) {
  ------------------
  |  |  226|      0|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (217:9): [True: 0, False: 0]
  ------------------
  218|      0|      return !planes_to_lf[plane];
  219|      0|    }
  220|      0|    if (plane == AOM_PLANE_U) {
  ------------------
  |  |  227|      0|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
  |  Branch (220:9): [True: 0, False: 0]
  ------------------
  221|       |      // U and V are handled together
  222|      0|      return !planes_to_lf[1] && !planes_to_lf[2];
  ------------------
  |  Branch (222:14): [True: 0, False: 0]
  |  Branch (222:34): [True: 0, False: 0]
  ------------------
  223|      0|    }
  224|      0|    assert(plane == AOM_PLANE_V);
  225|      0|    if (plane == AOM_PLANE_V) {
  ------------------
  |  |  228|      0|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
  |  Branch (225:9): [True: 0, False: 0]
  ------------------
  226|       |      // V is handled when u is filtered
  227|      0|      return true;
  228|      0|    }
  229|      0|  }
  230|       |
  231|       |  // Normal operation mode
  232|   387k|  return !planes_to_lf[plane];
  233|   387k|}
thread_common.c:get_lf_job_info:
  290|  1.20M|static inline AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
  291|  1.20M|  AV1LfMTInfo *cur_job_info = NULL;
  292|       |
  293|  1.20M|#if CONFIG_MULTITHREAD
  294|  1.20M|  pthread_mutex_lock(lf_sync->job_mutex);
  295|       |
  296|  1.22M|  if (!lf_sync->lf_mt_exit && lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
  ------------------
  |  Branch (296:7): [True: 1.22M, False: 18.4E]
  |  Branch (296:31): [True: 283k, False: 938k]
  ------------------
  297|   283k|    cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
  298|   283k|    lf_sync->jobs_dequeued++;
  299|   283k|  }
  300|       |
  301|  1.20M|  pthread_mutex_unlock(lf_sync->job_mutex);
  302|       |#else
  303|       |  (void)lf_sync;
  304|       |#endif
  305|       |
  306|  1.20M|  return cur_job_info;
  307|  1.20M|}
thread_common.c:loop_filter_data_reset:
  312|   938k|                                          MACROBLOCKD *xd) {
  313|   938k|  struct macroblockd_plane *pd = xd->plane;
  314|   938k|  lf_data->frame_buffer = frame_buffer;
  315|   938k|  lf_data->cm = cm;
  316|   938k|  lf_data->xd = xd;
  317|  3.75M|  for (int i = 0; i < MAX_MB_PLANE; i++) {
  ------------------
  |  |   36|  3.75M|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (317:19): [True: 2.81M, False: 938k]
  ------------------
  318|  2.81M|    lf_data->planes[i].dst = pd[i].dst;
  319|  2.81M|    lf_data->planes[i].subsampling_x = pd[i].subsampling_x;
  320|  2.81M|    lf_data->planes[i].subsampling_y = pd[i].subsampling_y;
  321|  2.81M|  }
  322|   938k|}

av1_tile_init:
   19|   380k|void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
   20|   380k|  av1_tile_set_row(tile, cm, row);
   21|   380k|  av1_tile_set_col(tile, cm, col);
   22|   380k|}
av1_get_tile_limits:
   32|   190k|void av1_get_tile_limits(AV1_COMMON *const cm) {
   33|   190k|  const SequenceHeader *const seq_params = cm->seq_params;
   34|   190k|  CommonTileParams *const tiles = &cm->tiles;
   35|   190k|  const int sb_cols =
   36|   190k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
  ------------------
  |  |   62|   190k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   37|   190k|  const int sb_rows =
   38|   190k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
  ------------------
  |  |   62|   190k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   39|       |
   40|   190k|  const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2;
  ------------------
  |  |   39|   190k|#define MI_SIZE_LOG2 2
  ------------------
   41|   190k|  tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
  ------------------
  |  |   50|   190k|#define MAX_TILE_WIDTH (4096)        // Max Tile width in pixels
  ------------------
   42|       |
   43|       |#if CONFIG_CWG_C013
   44|       |  bool use_level_7_above = false;
   45|       |  for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
   46|       |    if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_7_0 &&
   47|       |        seq_params->seq_level_idx[i] <= SEQ_LEVEL_8_3) {
   48|       |      // Currently it is assumed that levels 7.x and 8.x are either used for all
   49|       |      // operating points, or none of them.
   50|       |      if (i != 0 && !use_level_7_above) {
   51|       |        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
   52|       |                           "Either all the operating points are levels 7.x or "
   53|       |                           "8.x, or none of them are.");
   54|       |      }
   55|       |      use_level_7_above = true;
   56|       |    }
   57|       |  }
   58|       |  const int max_tile_area_sb =
   59|       |      (use_level_7_above ? MAX_TILE_AREA_LEVEL_7_AND_ABOVE : MAX_TILE_AREA) >>
   60|       |      (2 * sb_size_log2);
   61|       |#else
   62|   190k|  const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
  ------------------
  |  |   51|   190k|#define MAX_TILE_AREA (4096 * 2304)  // Maximum tile area in pixels
  ------------------
   63|   190k|#endif
   64|       |
   65|   190k|  tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols);
   66|   190k|  tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
  ------------------
  |  |   34|   190k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 184k, False: 5.48k]
  |  |  ------------------
  ------------------
   67|   190k|  tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
  ------------------
  |  |   34|   190k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 188k, False: 2.29k]
  |  |  ------------------
  ------------------
   68|   190k|  tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
   69|   190k|  tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols);
  ------------------
  |  |   35|   190k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 907, False: 189k]
  |  |  ------------------
  ------------------
   70|   190k|}
av1_calculate_tile_cols:
   74|   190k|                             CommonTileParams *const tiles) {
   75|   190k|  int sb_cols = CEIL_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2);
  ------------------
  |  |   62|   190k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   76|   190k|  int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
  ------------------
  |  |   62|   190k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   77|   190k|  int i;
   78|       |
   79|       |  // This will be overridden if there is at least two columns of tiles
   80|       |  // (otherwise there is no inner tile width)
   81|   190k|  tiles->min_inner_width = -1;
   82|       |
   83|   190k|  if (tiles->uniform_spacing) {
  ------------------
  |  Branch (83:7): [True: 100k, False: 90.3k]
  ------------------
   84|   100k|    int start_sb;
   85|   100k|    int size_sb = CEIL_POWER_OF_TWO(sb_cols, tiles->log2_cols);
  ------------------
  |  |   62|   100k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
   86|   100k|    assert(size_sb > 0);
   87|   247k|    for (i = 0, start_sb = 0; start_sb < sb_cols; i++) {
  ------------------
  |  Branch (87:31): [True: 147k, False: 100k]
  ------------------
   88|   147k|      tiles->col_start_sb[i] = start_sb;
   89|   147k|      start_sb += size_sb;
   90|   147k|    }
   91|   100k|    tiles->cols = i;
   92|   100k|    tiles->col_start_sb[i] = sb_cols;
   93|   100k|    tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0);
  ------------------
  |  |   35|   100k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 118, False: 99.9k]
  |  |  ------------------
  ------------------
   94|   100k|    tiles->max_height_sb = sb_rows >> tiles->min_log2_rows;
   95|       |
   96|   100k|    tiles->width = size_sb << seq_params->mib_size_log2;
   97|   100k|    tiles->width = AOMMIN(tiles->width, cm_mi_cols);
  ------------------
  |  |   34|   100k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 9.45k, False: 90.5k]
  |  |  ------------------
  ------------------
   98|   100k|    if (tiles->cols > 1) {
  ------------------
  |  Branch (98:9): [True: 9.45k, False: 90.5k]
  ------------------
   99|  9.45k|      tiles->min_inner_width = tiles->width;
  100|  9.45k|    }
  101|   100k|  } else {
  102|  90.3k|    int max_tile_area_sb = (sb_rows * sb_cols);
  103|  90.3k|    int widest_tile_sb = 1;
  104|  90.3k|    int narrowest_inner_tile_sb = 65536;
  105|  90.3k|    tiles->log2_cols = tile_log2(1, tiles->cols);
  106|   214k|    for (i = 0; i < tiles->cols; i++) {
  ------------------
  |  Branch (106:17): [True: 123k, False: 90.3k]
  ------------------
  107|   123k|      int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
  108|   123k|      widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
  ------------------
  |  |   35|   123k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 15.6k, False: 108k]
  |  |  ------------------
  ------------------
  109|       |      // ignore the rightmost tile in frame for determining the narrowest
  110|   123k|      if (i < tiles->cols - 1)
  ------------------
  |  Branch (110:11): [True: 33.2k, False: 90.3k]
  ------------------
  111|  33.2k|        narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb);
  ------------------
  |  |   34|  33.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 7.54k, False: 25.7k]
  |  |  ------------------
  ------------------
  112|   123k|    }
  113|  90.3k|    if (tiles->min_log2) {
  ------------------
  |  Branch (113:9): [True: 1.85k, False: 88.5k]
  ------------------
  114|  1.85k|      max_tile_area_sb >>= (tiles->min_log2 + 1);
  115|  1.85k|    }
  116|  90.3k|    tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
  ------------------
  |  |   35|  90.3k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 31.6k, False: 58.7k]
  |  |  ------------------
  ------------------
  117|  90.3k|    if (tiles->cols > 1) {
  ------------------
  |  Branch (117:9): [True: 11.5k, False: 78.8k]
  ------------------
  118|  11.5k|      tiles->min_inner_width = narrowest_inner_tile_sb
  119|  11.5k|                               << seq_params->mib_size_log2;
  120|  11.5k|    }
  121|  90.3k|  }
  122|   190k|}
av1_calculate_tile_rows:
  125|   190k|                             int cm_mi_rows, CommonTileParams *const tiles) {
  126|   190k|  int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
  ------------------
  |  |   62|   190k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  127|   190k|  int start_sb, size_sb, i;
  128|       |
  129|   190k|  if (tiles->uniform_spacing) {
  ------------------
  |  Branch (129:7): [True: 100k, False: 90.2k]
  ------------------
  130|   100k|    size_sb = CEIL_POWER_OF_TWO(sb_rows, tiles->log2_rows);
  ------------------
  |  |   62|   100k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  131|   100k|    assert(size_sb > 0);
  132|   232k|    for (i = 0, start_sb = 0; start_sb < sb_rows; i++) {
  ------------------
  |  Branch (132:31): [True: 131k, False: 100k]
  ------------------
  133|   131k|      tiles->row_start_sb[i] = start_sb;
  134|   131k|      start_sb += size_sb;
  135|   131k|    }
  136|   100k|    tiles->rows = i;
  137|   100k|    tiles->row_start_sb[i] = sb_rows;
  138|       |
  139|   100k|    tiles->height = size_sb << seq_params->mib_size_log2;
  140|   100k|    tiles->height = AOMMIN(tiles->height, cm_mi_rows);
  ------------------
  |  |   34|   100k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.58k, False: 93.4k]
  |  |  ------------------
  ------------------
  141|   100k|  } else {
  142|  90.2k|    tiles->log2_rows = tile_log2(1, tiles->rows);
  143|  90.2k|  }
  144|   190k|}
av1_tile_set_row:
  146|   456k|void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
  147|   456k|  assert(row < cm->tiles.rows);
  148|   456k|  int mi_row_start = cm->tiles.row_start_sb[row]
  149|   456k|                     << cm->seq_params->mib_size_log2;
  150|   456k|  int mi_row_end = cm->tiles.row_start_sb[row + 1]
  151|   456k|                   << cm->seq_params->mib_size_log2;
  152|   456k|  tile->tile_row = row;
  153|   456k|  tile->mi_row_start = mi_row_start;
  154|   456k|  tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows);
  ------------------
  |  |   34|   456k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 42.1k, False: 414k]
  |  |  ------------------
  ------------------
  155|   456k|  assert(tile->mi_row_end > tile->mi_row_start);
  156|   456k|}
av1_tile_set_col:
  158|   502k|void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
  159|   502k|  assert(col < cm->tiles.cols);
  160|   502k|  int mi_col_start = cm->tiles.col_start_sb[col]
  161|   502k|                     << cm->seq_params->mib_size_log2;
  162|   502k|  int mi_col_end = cm->tiles.col_start_sb[col + 1]
  163|   502k|                   << cm->seq_params->mib_size_log2;
  164|   502k|  tile->tile_col = col;
  165|   502k|  tile->mi_col_start = mi_col_start;
  166|   502k|  tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols);
  ------------------
  |  |   34|   502k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 21.4k, False: 480k]
  |  |  ------------------
  ------------------
  167|   502k|  assert(tile->mi_col_end > tile->mi_col_start);
  168|   502k|}
av1_get_sb_rows_in_tile:
  170|   400k|int av1_get_sb_rows_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
  171|   400k|  return CEIL_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start,
  ------------------
  |  |   62|   400k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  172|   400k|                           cm->seq_params->mib_size_log2);
  173|   400k|}
av1_get_sb_cols_in_tile:
  175|   127k|int av1_get_sb_cols_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
  176|   127k|  return CEIL_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start,
  ------------------
  |  |   62|   127k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
  177|   127k|                           cm->seq_params->mib_size_log2);
  178|   127k|}
av1_get_uniform_tile_size:
  189|  7.49k|bool av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) {
  190|  7.49k|  const CommonTileParams *const tiles = &cm->tiles;
  191|  7.49k|  if (tiles->uniform_spacing) {
  ------------------
  |  Branch (191:7): [True: 5.06k, False: 2.42k]
  ------------------
  192|  5.06k|    *w = tiles->width;
  193|  5.06k|    *h = tiles->height;
  194|  5.06k|  } else {
  195|  6.85k|    for (int i = 0; i < tiles->cols; ++i) {
  ------------------
  |  Branch (195:21): [True: 5.17k, False: 1.68k]
  ------------------
  196|  5.17k|      const int tile_width_sb =
  197|  5.17k|          tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
  198|  5.17k|      const int tile_w = tile_width_sb * cm->seq_params->mib_size;
  199|       |      // ensure all tiles have same dimension
  200|  5.17k|      if (i != 0 && tile_w != *w) {
  ------------------
  |  Branch (200:11): [True: 2.74k, False: 2.42k]
  |  Branch (200:21): [True: 745, False: 2.00k]
  ------------------
  201|    745|        return false;
  202|    745|      }
  203|  4.43k|      *w = tile_w;
  204|  4.43k|    }
  205|       |
  206|  5.52k|    for (int i = 0; i < tiles->rows; ++i) {
  ------------------
  |  Branch (206:21): [True: 4.30k, False: 1.22k]
  ------------------
  207|  4.30k|      const int tile_height_sb =
  208|  4.30k|          tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
  209|  4.30k|      const int tile_h = tile_height_sb * cm->seq_params->mib_size;
  210|       |      // ensure all tiles have same dimension
  211|  4.30k|      if (i != 0 && tile_h != *h) {
  ------------------
  |  Branch (211:11): [True: 2.61k, False: 1.68k]
  |  Branch (211:21): [True: 455, False: 2.16k]
  ------------------
  212|    455|        return false;
  213|    455|      }
  214|  3.84k|      *h = tile_h;
  215|  3.84k|    }
  216|  1.68k|  }
  217|  6.29k|  return true;
  218|  7.49k|}
av1_is_min_tile_width_satisfied:
  220|   189k|int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) {
  221|       |  // Disable check if there is a single tile col in the frame
  222|   189k|  if (cm->tiles.cols == 1) return 1;
  ------------------
  |  Branch (222:7): [True: 169k, False: 19.8k]
  ------------------
  223|       |
  224|  19.8k|  return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >=
  ------------------
  |  |   39|  19.8k|#define MI_SIZE_LOG2 2
  ------------------
  225|  19.8k|          (64 << av1_superres_scaled(cm)));
  226|   189k|}
tile_common.c:tile_log2:
   25|   942k|static int tile_log2(int blk_size, int target) {
   26|   942k|  int k;
   27|  1.39M|  for (k = 0; (blk_size << k) < target; k++) {
  ------------------
  |  Branch (27:15): [True: 450k, False: 942k]
  ------------------
   28|   450k|  }
   29|   942k|  return k;
   30|   942k|}

av1_max_level_bitrate:
   54|  4.33k|                              int seq_tier) {
   55|  4.33k|  int64_t bitrate;
   56|       |
   57|  4.33k|  if (seq_tier) {
  ------------------
  |  Branch (57:7): [True: 1.84k, False: 2.48k]
  ------------------
   58|  1.84k|    bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
   59|  2.48k|  } else {
   60|  2.48k|    bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
   61|  2.48k|  }
   62|       |
   63|  4.33k|  return bitrate * 1000;
   64|  4.33k|}

decodetxb.c:get_txb_ctx:
  450|  36.1M|                               TXB_CTX *const txb_ctx) {
  451|  36.1M|  switch (tx_size) {
  452|  10.4M|    case TX_4X4: get_txb_ctx_4x4(plane_bsize, plane, a, l, txb_ctx); break;
  ------------------
  |  Branch (452:5): [True: 10.4M, False: 25.7M]
  ------------------
  453|  5.22M|    case TX_8X8: get_txb_ctx_8x8(plane_bsize, plane, a, l, txb_ctx); break;
  ------------------
  |  Branch (453:5): [True: 5.22M, False: 30.9M]
  ------------------
  454|  3.08M|    case TX_16X16: get_txb_ctx_16x16(plane_bsize, plane, a, l, txb_ctx); break;
  ------------------
  |  Branch (454:5): [True: 3.08M, False: 33.0M]
  ------------------
  455|  2.18M|    case TX_32X32: get_txb_ctx_32x32(plane_bsize, plane, a, l, txb_ctx); break;
  ------------------
  |  Branch (455:5): [True: 2.18M, False: 33.9M]
  ------------------
  456|  15.2M|    default:
  ------------------
  |  Branch (456:5): [True: 15.2M, False: 20.9M]
  ------------------
  457|  15.2M|      get_txb_ctx_general(plane_bsize, tx_size, plane, a, l, txb_ctx);
  458|  15.2M|      break;
  459|  36.1M|  }
  460|  36.1M|}
decodetxb.c:get_txb_ctx_4x4:
  373|  10.4M|      TXB_CTX *const txb_ctx) {                                               \
  374|  10.4M|    static const int8_t signs[3] = { 0, -1, 1 };                              \
  375|  10.4M|    static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {        \
  376|  10.4M|      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,       \
  377|  10.4M|      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,       \
  378|  10.4M|      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2           \
  379|  10.4M|    };                                                                        \
  380|  10.4M|    const TX_SIZE tx_size = TX_##w##X##h;                                     \
  381|  10.4M|    const int txb_w_unit = tx_size_wide_unit[tx_size];                        \
  382|  10.4M|    const int txb_h_unit = tx_size_high_unit[tx_size];                        \
  383|  10.4M|    int dc_sign = 0;                                                          \
  384|  10.4M|    int k = 0;                                                                \
  385|  10.4M|                                                                              \
  386|  10.4M|    do {                                                                      \
  387|  10.4M|      const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;        \
  ------------------
  |  |   51|  10.4M|#define COEFF_CONTEXT_BITS 3
  ------------------
  388|  10.4M|      assert(sign <= 2);                                                      \
  389|  10.4M|      dc_sign += signs[sign];                                                 \
  390|  10.4M|    } while (++k < txb_w_unit);                                               \
  ------------------
  |  Branch (390:14): [True: 0, False: 10.4M]
  ------------------
  391|  10.4M|                                                                              \
  392|  10.4M|    k = 0;                                                                    \
  393|  10.4M|    do {                                                                      \
  394|  10.4M|      const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;        \
  ------------------
  |  |   51|  10.4M|#define COEFF_CONTEXT_BITS 3
  ------------------
  395|  10.4M|      assert(sign <= 2);                                                      \
  396|  10.5M|      dc_sign += signs[sign];                                                 \
  397|  10.5M|    } while (++k < txb_h_unit);                                               \
  ------------------
  |  Branch (397:14): [True: 0, False: 10.5M]
  ------------------
  398|  10.4M|                                                                              \
  399|  10.5M|    txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT];  \
  ------------------
  |  |  286|  10.5M|#define MAX_TX_SIZE_UNIT 16
  ------------------
  400|  10.5M|                                                                              \
  401|  10.5M|    if (plane == 0) {                                                         \
  ------------------
  |  Branch (401:9): [True: 4.93M, False: 5.60M]
  ------------------
  402|  4.93M|      if (plane_bsize == txsize_to_bsize[tx_size]) {                          \
  ------------------
  |  Branch (402:11): [True: 365k, False: 4.56M]
  ------------------
  403|   365k|        txb_ctx->txb_skip_ctx = 0;                                            \
  404|  4.56M|      } else {                                                                \
  405|  4.56M|        static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },       \
  406|  4.56M|                                                     { 2, 4, 4, 4, 5 },       \
  407|  4.56M|                                                     { 2, 4, 4, 4, 5 },       \
  408|  4.56M|                                                     { 2, 4, 4, 4, 5 },       \
  409|  4.56M|                                                     { 3, 5, 5, 5, 6 } };     \
  410|  4.56M|        int top = 0;                                                          \
  411|  4.56M|        int left = 0;                                                         \
  412|  4.56M|                                                                              \
  413|  4.56M|        k = 0;                                                                \
  414|  4.56M|        do {                                                                  \
  415|  4.56M|          top |= a[k];                                                        \
  416|  4.56M|        } while (++k < txb_w_unit);                                           \
  ------------------
  |  Branch (416:18): [True: 0, False: 4.56M]
  ------------------
  417|  4.56M|        top &= COEFF_CONTEXT_MASK;                                            \
  ------------------
  |  |   52|  4.56M|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|  4.56M|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  418|  4.56M|        top = AOMMIN(top, 4);                                                 \
  ------------------
  |  |   34|  4.56M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.35M, False: 1.21M]
  |  |  ------------------
  ------------------
  419|  4.56M|                                                                              \
  420|  4.56M|        k = 0;                                                                \
  421|  4.56M|        do {                                                                  \
  422|  4.56M|          left |= l[k];                                                       \
  423|  4.56M|        } while (++k < txb_h_unit);                                           \
  ------------------
  |  Branch (423:18): [True: 0, False: 4.56M]
  ------------------
  424|  4.56M|        left &= COEFF_CONTEXT_MASK;                                           \
  ------------------
  |  |   52|  4.56M|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|  4.56M|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  425|  4.56M|        left = AOMMIN(left, 4);                                               \
  ------------------
  |  |   34|  4.56M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.30M, False: 1.25M]
  |  |  ------------------
  ------------------
  426|  4.56M|                                                                              \
  427|  4.56M|        txb_ctx->txb_skip_ctx = skip_contexts[top][left];                     \
  428|  4.56M|      }                                                                       \
  429|  5.60M|    } else {                                                                  \
  430|  5.60M|      const int ctx_base = get_entropy_context(tx_size, a, l);                \
  431|  5.60M|      const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >             \
  ------------------
  |  Branch (431:30): [True: 3.20M, False: 2.40M]
  ------------------
  432|  5.60M|                              num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \
  433|  5.60M|                                 ? 10                                         \
  434|  5.60M|                                 : 7;                                         \
  435|  5.60M|      txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;                          \
  436|  5.60M|    }                                                                         \
  437|  10.5M|  }
decodetxb.c:get_txb_ctx_8x8:
  373|  5.22M|      TXB_CTX *const txb_ctx) {                                               \
  374|  5.22M|    static const int8_t signs[3] = { 0, -1, 1 };                              \
  375|  5.22M|    static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {        \
  376|  5.22M|      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,       \
  377|  5.22M|      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,       \
  378|  5.22M|      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2           \
  379|  5.22M|    };                                                                        \
  380|  5.22M|    const TX_SIZE tx_size = TX_##w##X##h;                                     \
  381|  5.22M|    const int txb_w_unit = tx_size_wide_unit[tx_size];                        \
  382|  5.22M|    const int txb_h_unit = tx_size_high_unit[tx_size];                        \
  383|  5.22M|    int dc_sign = 0;                                                          \
  384|  5.22M|    int k = 0;                                                                \
  385|  5.22M|                                                                              \
  386|  10.4M|    do {                                                                      \
  387|  10.4M|      const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;        \
  ------------------
  |  |   51|  10.4M|#define COEFF_CONTEXT_BITS 3
  ------------------
  388|  10.4M|      assert(sign <= 2);                                                      \
  389|  10.4M|      dc_sign += signs[sign];                                                 \
  390|  10.4M|    } while (++k < txb_w_unit);                                               \
  ------------------
  |  Branch (390:14): [True: 5.22M, False: 5.22M]
  ------------------
  391|  5.22M|                                                                              \
  392|  5.22M|    k = 0;                                                                    \
  393|  10.4M|    do {                                                                      \
  394|  10.4M|      const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;        \
  ------------------
  |  |   51|  10.4M|#define COEFF_CONTEXT_BITS 3
  ------------------
  395|  10.4M|      assert(sign <= 2);                                                      \
  396|  10.4M|      dc_sign += signs[sign];                                                 \
  397|  10.4M|    } while (++k < txb_h_unit);                                               \
  ------------------
  |  Branch (397:14): [True: 5.22M, False: 5.22M]
  ------------------
  398|  5.22M|                                                                              \
  399|  5.22M|    txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT];  \
  ------------------
  |  |  286|  5.22M|#define MAX_TX_SIZE_UNIT 16
  ------------------
  400|  5.22M|                                                                              \
  401|  5.22M|    if (plane == 0) {                                                         \
  ------------------
  |  Branch (401:9): [True: 2.61M, False: 2.61M]
  ------------------
  402|  2.61M|      if (plane_bsize == txsize_to_bsize[tx_size]) {                          \
  ------------------
  |  Branch (402:11): [True: 1.08M, False: 1.53M]
  ------------------
  403|  1.08M|        txb_ctx->txb_skip_ctx = 0;                                            \
  404|  1.53M|      } else {                                                                \
  405|  1.53M|        static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },       \
  406|  1.53M|                                                     { 2, 4, 4, 4, 5 },       \
  407|  1.53M|                                                     { 2, 4, 4, 4, 5 },       \
  408|  1.53M|                                                     { 2, 4, 4, 4, 5 },       \
  409|  1.53M|                                                     { 3, 5, 5, 5, 6 } };     \
  410|  1.53M|        int top = 0;                                                          \
  411|  1.53M|        int left = 0;                                                         \
  412|  1.53M|                                                                              \
  413|  1.53M|        k = 0;                                                                \
  414|  3.06M|        do {                                                                  \
  415|  3.06M|          top |= a[k];                                                        \
  416|  3.06M|        } while (++k < txb_w_unit);                                           \
  ------------------
  |  Branch (416:18): [True: 1.53M, False: 1.53M]
  ------------------
  417|  1.53M|        top &= COEFF_CONTEXT_MASK;                                            \
  ------------------
  |  |   52|  1.53M|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|  1.53M|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  418|  1.53M|        top = AOMMIN(top, 4);                                                 \
  ------------------
  |  |   34|  1.53M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.01M, False: 512k]
  |  |  ------------------
  ------------------
  419|  1.53M|                                                                              \
  420|  1.53M|        k = 0;                                                                \
  421|  3.06M|        do {                                                                  \
  422|  3.06M|          left |= l[k];                                                       \
  423|  3.06M|        } while (++k < txb_h_unit);                                           \
  ------------------
  |  Branch (423:18): [True: 1.53M, False: 1.53M]
  ------------------
  424|  1.53M|        left &= COEFF_CONTEXT_MASK;                                           \
  ------------------
  |  |   52|  1.53M|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|  1.53M|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  425|  1.53M|        left = AOMMIN(left, 4);                                               \
  ------------------
  |  |   34|  1.53M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.01M, False: 516k]
  |  |  ------------------
  ------------------
  426|  1.53M|                                                                              \
  427|  1.53M|        txb_ctx->txb_skip_ctx = skip_contexts[top][left];                     \
  428|  1.53M|      }                                                                       \
  429|  2.61M|    } else {                                                                  \
  430|  2.61M|      const int ctx_base = get_entropy_context(tx_size, a, l);                \
  431|  2.61M|      const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >             \
  ------------------
  |  Branch (431:30): [True: 0, False: 2.61M]
  ------------------
  432|  2.61M|                              num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \
  433|  2.61M|                                 ? 10                                         \
  434|  2.61M|                                 : 7;                                         \
  435|  2.61M|      txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;                          \
  436|  2.61M|    }                                                                         \
  437|  5.22M|  }
decodetxb.c:get_txb_ctx_16x16:
  373|  3.08M|      TXB_CTX *const txb_ctx) {                                               \
  374|  3.08M|    static const int8_t signs[3] = { 0, -1, 1 };                              \
  375|  3.08M|    static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {        \
  376|  3.08M|      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,       \
  377|  3.08M|      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,       \
  378|  3.08M|      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2           \
  379|  3.08M|    };                                                                        \
  380|  3.08M|    const TX_SIZE tx_size = TX_##w##X##h;                                     \
  381|  3.08M|    const int txb_w_unit = tx_size_wide_unit[tx_size];                        \
  382|  3.08M|    const int txb_h_unit = tx_size_high_unit[tx_size];                        \
  383|  3.08M|    int dc_sign = 0;                                                          \
  384|  3.08M|    int k = 0;                                                                \
  385|  3.08M|                                                                              \
  386|  12.3M|    do {                                                                      \
  387|  12.3M|      const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;        \
  ------------------
  |  |   51|  12.3M|#define COEFF_CONTEXT_BITS 3
  ------------------
  388|  12.3M|      assert(sign <= 2);                                                      \
  389|  12.3M|      dc_sign += signs[sign];                                                 \
  390|  12.3M|    } while (++k < txb_w_unit);                                               \
  ------------------
  |  Branch (390:14): [True: 9.26M, False: 3.08M]
  ------------------
  391|  3.08M|                                                                              \
  392|  3.08M|    k = 0;                                                                    \
  393|  12.3M|    do {                                                                      \
  394|  12.3M|      const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;        \
  ------------------
  |  |   51|  12.3M|#define COEFF_CONTEXT_BITS 3
  ------------------
  395|  12.3M|      assert(sign <= 2);                                                      \
  396|  12.3M|      dc_sign += signs[sign];                                                 \
  397|  12.3M|    } while (++k < txb_h_unit);                                               \
  ------------------
  |  Branch (397:14): [True: 9.26M, False: 3.08M]
  ------------------
  398|  3.08M|                                                                              \
  399|  3.08M|    txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT];  \
  ------------------
  |  |  286|  3.08M|#define MAX_TX_SIZE_UNIT 16
  ------------------
  400|  3.08M|                                                                              \
  401|  3.08M|    if (plane == 0) {                                                         \
  ------------------
  |  Branch (401:9): [True: 1.38M, False: 1.70M]
  ------------------
  402|  1.38M|      if (plane_bsize == txsize_to_bsize[tx_size]) {                          \
  ------------------
  |  Branch (402:11): [True: 1.01M, False: 365k]
  ------------------
  403|  1.01M|        txb_ctx->txb_skip_ctx = 0;                                            \
  404|  1.01M|      } else {                                                                \
  405|   365k|        static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },       \
  406|   365k|                                                     { 2, 4, 4, 4, 5 },       \
  407|   365k|                                                     { 2, 4, 4, 4, 5 },       \
  408|   365k|                                                     { 2, 4, 4, 4, 5 },       \
  409|   365k|                                                     { 3, 5, 5, 5, 6 } };     \
  410|   365k|        int top = 0;                                                          \
  411|   365k|        int left = 0;                                                         \
  412|   365k|                                                                              \
  413|   365k|        k = 0;                                                                \
  414|  1.46M|        do {                                                                  \
  415|  1.46M|          top |= a[k];                                                        \
  416|  1.46M|        } while (++k < txb_w_unit);                                           \
  ------------------
  |  Branch (416:18): [True: 1.09M, False: 365k]
  ------------------
  417|   365k|        top &= COEFF_CONTEXT_MASK;                                            \
  ------------------
  |  |   52|   365k|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|   365k|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  418|   365k|        top = AOMMIN(top, 4);                                                 \
  ------------------
  |  |   34|   365k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 243k, False: 122k]
  |  |  ------------------
  ------------------
  419|   365k|                                                                              \
  420|   365k|        k = 0;                                                                \
  421|  1.46M|        do {                                                                  \
  422|  1.46M|          left |= l[k];                                                       \
  423|  1.46M|        } while (++k < txb_h_unit);                                           \
  ------------------
  |  Branch (423:18): [True: 1.09M, False: 365k]
  ------------------
  424|   365k|        left &= COEFF_CONTEXT_MASK;                                           \
  ------------------
  |  |   52|   365k|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|   365k|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  425|   365k|        left = AOMMIN(left, 4);                                               \
  ------------------
  |  |   34|   365k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 239k, False: 126k]
  |  |  ------------------
  ------------------
  426|   365k|                                                                              \
  427|   365k|        txb_ctx->txb_skip_ctx = skip_contexts[top][left];                     \
  428|   365k|      }                                                                       \
  429|  1.70M|    } else {                                                                  \
  430|  1.70M|      const int ctx_base = get_entropy_context(tx_size, a, l);                \
  431|  1.70M|      const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >             \
  ------------------
  |  Branch (431:30): [True: 0, False: 1.70M]
  ------------------
  432|  1.70M|                              num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \
  433|  1.70M|                                 ? 10                                         \
  434|  1.70M|                                 : 7;                                         \
  435|  1.70M|      txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;                          \
  436|  1.70M|    }                                                                         \
  437|  3.08M|  }
decodetxb.c:get_txb_ctx_32x32:
  373|  2.18M|      TXB_CTX *const txb_ctx) {                                               \
  374|  2.18M|    static const int8_t signs[3] = { 0, -1, 1 };                              \
  375|  2.18M|    static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {        \
  376|  2.18M|      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,       \
  377|  2.18M|      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,       \
  378|  2.18M|      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2           \
  379|  2.18M|    };                                                                        \
  380|  2.18M|    const TX_SIZE tx_size = TX_##w##X##h;                                     \
  381|  2.18M|    const int txb_w_unit = tx_size_wide_unit[tx_size];                        \
  382|  2.18M|    const int txb_h_unit = tx_size_high_unit[tx_size];                        \
  383|  2.18M|    int dc_sign = 0;                                                          \
  384|  2.18M|    int k = 0;                                                                \
  385|  2.18M|                                                                              \
  386|  17.5M|    do {                                                                      \
  387|  17.5M|      const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;        \
  ------------------
  |  |   51|  17.5M|#define COEFF_CONTEXT_BITS 3
  ------------------
  388|  17.5M|      assert(sign <= 2);                                                      \
  389|  17.5M|      dc_sign += signs[sign];                                                 \
  390|  17.5M|    } while (++k < txb_w_unit);                                               \
  ------------------
  |  Branch (390:14): [True: 15.3M, False: 2.18M]
  ------------------
  391|  2.18M|                                                                              \
  392|  2.18M|    k = 0;                                                                    \
  393|  17.5M|    do {                                                                      \
  394|  17.5M|      const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;        \
  ------------------
  |  |   51|  17.5M|#define COEFF_CONTEXT_BITS 3
  ------------------
  395|  17.5M|      assert(sign <= 2);                                                      \
  396|  17.5M|      dc_sign += signs[sign];                                                 \
  397|  17.5M|    } while (++k < txb_h_unit);                                               \
  ------------------
  |  Branch (397:14): [True: 15.3M, False: 2.18M]
  ------------------
  398|  2.18M|                                                                              \
  399|  2.18M|    txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT];  \
  ------------------
  |  |  286|  2.18M|#define MAX_TX_SIZE_UNIT 16
  ------------------
  400|  2.18M|                                                                              \
  401|  2.18M|    if (plane == 0) {                                                         \
  ------------------
  |  Branch (401:9): [True: 756k, False: 1.43M]
  ------------------
  402|   756k|      if (plane_bsize == txsize_to_bsize[tx_size]) {                          \
  ------------------
  |  Branch (402:11): [True: 667k, False: 88.9k]
  ------------------
  403|   667k|        txb_ctx->txb_skip_ctx = 0;                                            \
  404|   667k|      } else {                                                                \
  405|  88.9k|        static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },       \
  406|  88.9k|                                                     { 2, 4, 4, 4, 5 },       \
  407|  88.9k|                                                     { 2, 4, 4, 4, 5 },       \
  408|  88.9k|                                                     { 2, 4, 4, 4, 5 },       \
  409|  88.9k|                                                     { 3, 5, 5, 5, 6 } };     \
  410|  88.9k|        int top = 0;                                                          \
  411|  88.9k|        int left = 0;                                                         \
  412|  88.9k|                                                                              \
  413|  88.9k|        k = 0;                                                                \
  414|   711k|        do {                                                                  \
  415|   711k|          top |= a[k];                                                        \
  416|   711k|        } while (++k < txb_w_unit);                                           \
  ------------------
  |  Branch (416:18): [True: 622k, False: 88.9k]
  ------------------
  417|  88.9k|        top &= COEFF_CONTEXT_MASK;                                            \
  ------------------
  |  |   52|  88.9k|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|  88.9k|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  418|  88.9k|        top = AOMMIN(top, 4);                                                 \
  ------------------
  |  |   34|  88.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 76.4k, False: 12.4k]
  |  |  ------------------
  ------------------
  419|  88.9k|                                                                              \
  420|  88.9k|        k = 0;                                                                \
  421|   711k|        do {                                                                  \
  422|   711k|          left |= l[k];                                                       \
  423|   711k|        } while (++k < txb_h_unit);                                           \
  ------------------
  |  Branch (423:18): [True: 622k, False: 88.9k]
  ------------------
  424|  88.9k|        left &= COEFF_CONTEXT_MASK;                                           \
  ------------------
  |  |   52|  88.9k|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|  88.9k|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  425|  88.9k|        left = AOMMIN(left, 4);                                               \
  ------------------
  |  |   34|  88.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 75.5k, False: 13.3k]
  |  |  ------------------
  ------------------
  426|  88.9k|                                                                              \
  427|  88.9k|        txb_ctx->txb_skip_ctx = skip_contexts[top][left];                     \
  428|  88.9k|      }                                                                       \
  429|  1.43M|    } else {                                                                  \
  430|  1.43M|      const int ctx_base = get_entropy_context(tx_size, a, l);                \
  431|  1.43M|      const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >             \
  ------------------
  |  Branch (431:30): [True: 1.11M, False: 315k]
  ------------------
  432|  1.43M|                              num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \
  433|  1.43M|                                 ? 10                                         \
  434|  1.43M|                                 : 7;                                         \
  435|  1.43M|      txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;                          \
  436|  1.43M|    }                                                                         \
  437|  2.18M|  }
decodetxb.c:get_txb_ctx_general:
  285|  15.2M|                                TXB_CTX *const txb_ctx) {
  286|  15.2M|#define MAX_TX_SIZE_UNIT 16
  287|  15.2M|  static const int8_t signs[3] = { 0, -1, 1 };
  288|  15.2M|  static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {
  289|  15.2M|    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  290|  15.2M|    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  291|  15.2M|    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  292|  15.2M|  };
  293|  15.2M|  const int txb_w_unit = tx_size_wide_unit[tx_size];
  294|  15.2M|  const int txb_h_unit = tx_size_high_unit[tx_size];
  295|  15.2M|  int dc_sign = 0;
  296|  15.2M|  int k = 0;
  297|       |
  298|  68.4M|  do {
  299|  68.4M|    const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;
  ------------------
  |  |   51|  68.4M|#define COEFF_CONTEXT_BITS 3
  ------------------
  300|  68.4M|    assert(sign <= 2);
  301|  68.4M|    dc_sign += signs[sign];
  302|  68.4M|  } while (++k < txb_w_unit);
  ------------------
  |  Branch (302:12): [True: 53.1M, False: 15.2M]
  ------------------
  303|       |
  304|  15.2M|  k = 0;
  305|  45.1M|  do {
  306|  45.1M|    const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;
  ------------------
  |  |   51|  45.1M|#define COEFF_CONTEXT_BITS 3
  ------------------
  307|  45.1M|    assert(sign <= 2);
  308|  45.1M|    dc_sign += signs[sign];
  309|  45.1M|  } while (++k < txb_h_unit);
  ------------------
  |  Branch (309:12): [True: 29.8M, False: 15.2M]
  ------------------
  310|       |
  311|  15.2M|  txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT];
  ------------------
  |  |  286|  15.2M|#define MAX_TX_SIZE_UNIT 16
  ------------------
  312|       |
  313|  15.2M|  if (plane == 0) {
  ------------------
  |  Branch (313:7): [True: 6.12M, False: 9.13M]
  ------------------
  314|  6.12M|    if (plane_bsize == txsize_to_bsize[tx_size]) {
  ------------------
  |  Branch (314:9): [True: 5.23M, False: 888k]
  ------------------
  315|  5.23M|      txb_ctx->txb_skip_ctx = 0;
  316|  5.23M|    } else {
  317|       |      // This is the algorithm to generate table skip_contexts[top][left].
  318|       |      //    const int max = AOMMIN(top | left, 4);
  319|       |      //    const int min = AOMMIN(AOMMIN(top, left), 4);
  320|       |      //    if (!max)
  321|       |      //      txb_skip_ctx = 1;
  322|       |      //    else if (!min)
  323|       |      //      txb_skip_ctx = 2 + (max > 3);
  324|       |      //    else if (max <= 3)
  325|       |      //      txb_skip_ctx = 4;
  326|       |      //    else if (min <= 3)
  327|       |      //      txb_skip_ctx = 5;
  328|       |      //    else
  329|       |      //      txb_skip_ctx = 6;
  330|   888k|      static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },
  331|   888k|                                                   { 2, 4, 4, 4, 5 },
  332|   888k|                                                   { 2, 4, 4, 4, 5 },
  333|   888k|                                                   { 2, 4, 4, 4, 5 },
  334|   888k|                                                   { 3, 5, 5, 5, 6 } };
  335|       |      // For top and left, we only care about which of the following three
  336|       |      // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The
  337|       |      // spec calculates top and left with the Max() function. We can calculate
  338|       |      // an approximate max with bitwise OR because the real max and the
  339|       |      // approximate max belong to the same category.
  340|   888k|      int top = 0;
  341|   888k|      int left = 0;
  342|       |
  343|   888k|      k = 0;
  344|  8.44M|      do {
  345|  8.44M|        top |= a[k];
  346|  8.44M|      } while (++k < txb_w_unit);
  ------------------
  |  Branch (346:16): [True: 7.56M, False: 888k]
  ------------------
  347|   888k|      top &= COEFF_CONTEXT_MASK;
  ------------------
  |  |   52|   888k|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|   888k|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  348|   888k|      top = AOMMIN(top, 4);
  ------------------
  |  |   34|   888k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 659k, False: 229k]
  |  |  ------------------
  ------------------
  349|       |
  350|   888k|      k = 0;
  351|  8.33M|      do {
  352|  8.33M|        left |= l[k];
  353|  8.33M|      } while (++k < txb_h_unit);
  ------------------
  |  Branch (353:16): [True: 7.44M, False: 888k]
  ------------------
  354|   888k|      left &= COEFF_CONTEXT_MASK;
  ------------------
  |  |   52|   888k|#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
  |  |  ------------------
  |  |  |  |   51|   888k|#define COEFF_CONTEXT_BITS 3
  |  |  ------------------
  ------------------
  355|   888k|      left = AOMMIN(left, 4);
  ------------------
  |  |   34|   888k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 652k, False: 236k]
  |  |  ------------------
  ------------------
  356|       |
  357|   888k|      txb_ctx->txb_skip_ctx = skip_contexts[top][left];
  358|   888k|    }
  359|  9.13M|  } else {
  360|  9.13M|    const int ctx_base = get_entropy_context(tx_size, a, l);
  361|  9.13M|    const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >
  ------------------
  |  Branch (361:28): [True: 110k, False: 9.02M]
  ------------------
  362|  9.13M|                            num_pels_log2_lookup[txsize_to_bsize[tx_size]])
  363|  9.13M|                               ? 10
  364|  9.13M|                               : 7;
  365|  9.13M|    txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
  366|  9.13M|  }
  367|  15.2M|}
decodetxb.c:get_txb_bhl:
   50|  36.1M|static inline int get_txb_bhl(TX_SIZE tx_size) {
   51|  36.1M|  tx_size = av1_get_adjusted_tx_size(tx_size);
   52|  36.1M|  return tx_size_high_log2[tx_size];
   53|  36.1M|}
decodetxb.c:get_txb_wide:
   55|  36.9M|static inline int get_txb_wide(TX_SIZE tx_size) {
   56|  36.9M|  tx_size = av1_get_adjusted_tx_size(tx_size);
   57|  36.9M|  return tx_size_wide[tx_size];
   58|  36.9M|}
decodetxb.c:get_txb_high:
   60|  35.3M|static inline int get_txb_high(TX_SIZE tx_size) {
   61|  35.3M|  tx_size = av1_get_adjusted_tx_size(tx_size);
   62|  35.3M|  return tx_size_high[tx_size];
   63|  35.3M|}
decodetxb.c:set_levels:
   65|  34.5M|static inline uint8_t *set_levels(uint8_t *const levels_buf, const int height) {
   66|  34.5M|  return levels_buf + TX_PAD_TOP * (height + TX_PAD_HOR);
  ------------------
  |  |  193|  34.5M|#define TX_PAD_TOP 0
  ------------------
                return levels_buf + TX_PAD_TOP * (height + TX_PAD_HOR);
  ------------------
  |  |  190|  34.5M|#define TX_PAD_HOR 4
  ------------------
   67|  34.5M|}
decodetxb.c:get_lower_levels_ctx_eob:
  229|  21.7M|static inline int get_lower_levels_ctx_eob(int bhl, int width, int scan_idx) {
  230|  21.7M|  if (scan_idx == 0) return 0;
  ------------------
  |  Branch (230:7): [True: 9.78M, False: 11.9M]
  ------------------
  231|  11.9M|  if (scan_idx <= (width << bhl) / 8) return 1;
  ------------------
  |  Branch (231:7): [True: 5.46M, False: 6.49M]
  ------------------
  232|  6.49M|  if (scan_idx <= (width << bhl) / 4) return 2;
  ------------------
  |  Branch (232:7): [True: 1.75M, False: 4.74M]
  ------------------
  233|  4.74M|  return 3;
  234|  6.49M|}
decodetxb.c:get_br_ctx_eob:
   92|   660k|                                           const TX_CLASS tx_class) {
   93|   660k|  const int col = c >> bhl;
   94|   660k|  const int row = c - (col << bhl);
   95|   660k|  if (c == 0) return 0;
  ------------------
  |  Branch (95:7): [True: 397k, False: 263k]
  ------------------
   96|   263k|  if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) ||
  ------------------
  |  Branch (96:8): [True: 246k, False: 16.1k]
  |  Branch (96:35): [True: 57.9k, False: 189k]
  |  Branch (96:46): [True: 17.2k, False: 40.6k]
  ------------------
   97|   263k|      (tx_class == TX_CLASS_HORIZ && col == 0) ||
  ------------------
  |  Branch (97:8): [True: 11.6k, False: 234k]
  |  Branch (97:38): [True: 2.03k, False: 9.56k]
  ------------------
   98|   263k|      (tx_class == TX_CLASS_VERT && row == 0))
  ------------------
  |  Branch (98:8): [True: 4.61k, False: 239k]
  |  Branch (98:37): [True: 524, False: 4.08k]
  ------------------
   99|  19.8k|    return 7;
  100|   243k|  return 14;
  101|   263k|}
decodetxb.c:get_padded_idx:
   69|   786M|static inline int get_padded_idx(const int idx, const int bhl) {
   70|   786M|  return idx + ((idx >> bhl) << TX_PAD_HOR_LOG2);
  ------------------
  |  |  189|   786M|#define TX_PAD_HOR_LOG2 2
  ------------------
   71|   786M|}
decodetxb.c:get_lower_levels_ctx_2d:
  237|   224M|                                          int bhl, TX_SIZE tx_size) {
  238|   224M|  assert(coeff_idx > 0);
  239|   224M|  int mag;
  240|       |  // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
  241|   224M|  levels = levels + get_padded_idx(coeff_idx, bhl);
  242|   224M|  mag = AOMMIN(levels[(1 << bhl) + TX_PAD_HOR], 3);               // { 0, 1 }
  ------------------
  |  |   34|   224M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 212M, False: 12.1M]
  |  |  ------------------
  ------------------
  243|   224M|  mag += AOMMIN(levels[1], 3);                                    // { 1, 0 }
  ------------------
  |  |   34|   224M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 212M, False: 12.0M]
  |  |  ------------------
  ------------------
  244|   224M|  mag += AOMMIN(levels[(1 << bhl) + TX_PAD_HOR + 1], 3);          // { 1, 1 }
  ------------------
  |  |   34|   224M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 214M, False: 9.54M]
  |  |  ------------------
  ------------------
  245|   224M|  mag += AOMMIN(levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)], 3);  // { 0, 2 }
  ------------------
  |  |   34|   224M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 215M, False: 9.28M]
  |  |  ------------------
  ------------------
  246|   224M|  mag += AOMMIN(levels[2], 3);                                    // { 2, 0 }
  ------------------
  |  |   34|   224M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 215M, False: 9.11M]
  |  |  ------------------
  ------------------
  247|       |
  248|   224M|  const int ctx = AOMMIN((mag + 1) >> 1, 4);
  ------------------
  |  |   34|   224M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 209M, False: 14.5M]
  |  |  ------------------
  ------------------
  249|   224M|  return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
  250|   224M|}
decodetxb.c:get_br_ctx_2d:
   75|  15.0M|                                const int bhl) {
   76|  15.0M|  assert(c > 0);
   77|  15.0M|  const int col = c >> bhl;
   78|  15.0M|  const int row = c - (col << bhl);
   79|  15.0M|  const int stride = (1 << bhl) + TX_PAD_HOR;
  ------------------
  |  |  190|  15.0M|#define TX_PAD_HOR 4
  ------------------
   80|  15.0M|  const int pos = col * stride + row;
   81|  15.0M|  int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) +
  ------------------
  |  |   34|  15.0M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 14.8M, False: 138k]
  |  |  ------------------
  ------------------
   82|  15.0M|            AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) +
  ------------------
  |  |   34|  15.0M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 14.8M, False: 143k]
  |  |  ------------------
  ------------------
   83|  15.0M|            AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE);
  ------------------
  |  |   34|  15.0M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 14.9M, False: 80.3k]
  |  |  ------------------
  ------------------
   84|  15.0M|  mag = AOMMIN((mag + 1) >> 1, 6);
  ------------------
  |  |   34|  15.0M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 13.2M, False: 1.77M]
  |  |  ------------------
  ------------------
   85|       |  //((row | col) < 2) is equivalent to ((row < 2) && (col < 2))
   86|  15.0M|  if ((row | col) < 2) return mag + 7;
  ------------------
  |  Branch (86:7): [True: 3.37M, False: 11.6M]
  ------------------
   87|  11.6M|  return mag + 14;
   88|  15.0M|}
decodetxb.c:get_lower_levels_ctx:
  254|  30.4M|                                                 TX_CLASS tx_class) {
  255|  30.4M|  const int stats =
  256|  30.4M|      get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class);
  257|  30.4M|  return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class);
  258|  30.4M|}
decodetxb.c:get_nz_mag:
  151|  30.4M|                                       const int bhl, const TX_CLASS tx_class) {
  152|  30.4M|  int mag;
  153|       |
  154|       |  // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
  155|  30.4M|  mag = clip_max3[levels[(1 << bhl) + TX_PAD_HOR]];  // { 0, 1 }
  ------------------
  |  |  190|  30.4M|#define TX_PAD_HOR 4
  ------------------
  156|  30.4M|  mag += clip_max3[levels[1]];                       // { 1, 0 }
  157|       |
  158|  30.4M|  if (tx_class == TX_CLASS_2D) {
  ------------------
  |  Branch (158:7): [True: 10.9M, False: 19.4M]
  ------------------
  159|  10.9M|    mag += clip_max3[levels[(1 << bhl) + TX_PAD_HOR + 1]];          // { 1, 1 }
  ------------------
  |  |  190|  10.9M|#define TX_PAD_HOR 4
  ------------------
  160|  10.9M|    mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]];  // { 0, 2 }
  ------------------
  |  |  189|  10.9M|#define TX_PAD_HOR_LOG2 2
  ------------------
  161|  10.9M|    mag += clip_max3[levels[2]];                                    // { 2, 0 }
  162|  19.4M|  } else if (tx_class == TX_CLASS_VERT) {
  ------------------
  |  Branch (162:14): [True: 6.19M, False: 13.3M]
  ------------------
  163|  6.19M|    mag += clip_max3[levels[2]];  // { 2, 0 }
  164|  6.19M|    mag += clip_max3[levels[3]];  // { 3, 0 }
  165|  6.19M|    mag += clip_max3[levels[4]];  // { 4, 0 }
  166|  13.3M|  } else {
  167|  13.3M|    mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]];  // { 0, 2 }
  ------------------
  |  |  189|  13.3M|#define TX_PAD_HOR_LOG2 2
  ------------------
  168|  13.3M|    mag += clip_max3[levels[(3 << bhl) + (3 << TX_PAD_HOR_LOG2)]];  // { 0, 3 }
  ------------------
  |  |  189|  13.3M|#define TX_PAD_HOR_LOG2 2
  ------------------
  169|  13.3M|    mag += clip_max3[levels[(4 << bhl) + (4 << TX_PAD_HOR_LOG2)]];  // { 0, 4 }
  ------------------
  |  |  189|  13.3M|#define TX_PAD_HOR_LOG2 2
  ------------------
  170|  13.3M|  }
  171|       |
  172|  30.4M|  return mag;
  173|  30.4M|}
decodetxb.c:get_nz_map_ctx_from_stats:
  192|  30.4M|    const int bhl, const TX_SIZE tx_size, const TX_CLASS tx_class) {
  193|       |  // tx_class == 0(TX_CLASS_2D)
  194|  30.4M|  if ((tx_class | coeff_idx) == 0) return 0;
  ------------------
  |  Branch (194:7): [True: 10.9M, False: 19.5M]
  ------------------
  195|  19.5M|  int ctx = (stats + 1) >> 1;
  196|  19.5M|  ctx = AOMMIN(ctx, 4);
  ------------------
  |  |   34|  19.5M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 19.0M, False: 451k]
  |  |  ------------------
  ------------------
  197|  19.5M|  switch (tx_class) {
  198|      0|    case TX_CLASS_2D: {
  ------------------
  |  Branch (198:5): [True: 0, False: 19.5M]
  ------------------
  199|       |      // This is the algorithm to generate av1_nz_map_ctx_offset[][]
  200|       |      //   const int width = tx_size_wide[tx_size];
  201|       |      //   const int height = tx_size_high[tx_size];
  202|       |      //   if (width < height) {
  203|       |      //     if (row < 2) return 11 + ctx;
  204|       |      //   } else if (width > height) {
  205|       |      //     if (col < 2) return 16 + ctx;
  206|       |      //   }
  207|       |      //   if (row + col < 2) return ctx + 1;
  208|       |      //   if (row + col < 4) return 5 + ctx + 1;
  209|       |      //   return 21 + ctx;
  210|      0|      return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
  211|      0|    }
  212|  13.3M|    case TX_CLASS_HORIZ: {
  ------------------
  |  Branch (212:5): [True: 13.3M, False: 6.13M]
  ------------------
  213|  13.3M|      const int col = coeff_idx >> bhl;
  214|  13.3M|      return ctx + nz_map_ctx_offset_1d[col];
  215|      0|    }
  216|  6.19M|    case TX_CLASS_VERT: {
  ------------------
  |  Branch (216:5): [True: 6.19M, False: 13.3M]
  ------------------
  217|  6.19M|      const int col = coeff_idx >> bhl;
  218|  6.19M|      const int row = coeff_idx - (col << bhl);
  219|  6.19M|      return ctx + nz_map_ctx_offset_1d[row];
  220|      0|    }
  221|      0|    default: break;
  ------------------
  |  Branch (221:5): [True: 0, False: 19.5M]
  ------------------
  222|  19.5M|  }
  223|      0|  return 0;
  224|  19.5M|}
decodetxb.c:get_br_ctx:
  105|  4.94M|                                       const int bhl, const TX_CLASS tx_class) {
  106|  4.94M|  const int col = c >> bhl;
  107|  4.94M|  const int row = c - (col << bhl);
  108|  4.94M|  const int stride = (1 << bhl) + TX_PAD_HOR;
  ------------------
  |  |  190|  4.94M|#define TX_PAD_HOR 4
  ------------------
  109|  4.94M|  const int pos = col * stride + row;
  110|  4.94M|  int mag = levels[pos + 1];
  111|  4.94M|  mag += levels[pos + stride];
  112|  4.94M|  switch (tx_class) {
  113|  3.94M|    case TX_CLASS_2D:
  ------------------
  |  Branch (113:5): [True: 3.94M, False: 1.00M]
  ------------------
  114|  3.94M|      mag += levels[pos + stride + 1];
  115|  3.94M|      mag = AOMMIN((mag + 1) >> 1, 6);
  ------------------
  |  |   34|  3.94M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.58M, False: 353k]
  |  |  ------------------
  ------------------
  116|  3.94M|      if (c == 0) return mag;
  ------------------
  |  Branch (116:11): [True: 3.94M, False: 18.4E]
  ------------------
  117|  18.4E|      if ((row < 2) && (col < 2)) return mag + 7;
  ------------------
  |  Branch (117:11): [True: 0, False: 18.4E]
  |  Branch (117:24): [True: 0, False: 0]
  ------------------
  118|  18.4E|      break;
  119|  18.4E|    case TX_CLASS_HORIZ:
  ------------------
  |  Branch (119:5): [True: 698k, False: 4.25M]
  ------------------
  120|   698k|      mag += levels[pos + (stride << 1)];
  121|   698k|      mag = AOMMIN((mag + 1) >> 1, 6);
  ------------------
  |  |   34|   698k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 651k, False: 46.7k]
  |  |  ------------------
  ------------------
  122|   698k|      if (c == 0) return mag;
  ------------------
  |  Branch (122:11): [True: 68.6k, False: 629k]
  ------------------
  123|   629k|      if (col == 0) return mag + 7;
  ------------------
  |  Branch (123:11): [True: 291k, False: 338k]
  ------------------
  124|   338k|      break;
  125|   338k|    case TX_CLASS_VERT:
  ------------------
  |  Branch (125:5): [True: 309k, False: 4.64M]
  ------------------
  126|   309k|      mag += levels[pos + 2];
  127|   309k|      mag = AOMMIN((mag + 1) >> 1, 6);
  ------------------
  |  |   34|   309k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 287k, False: 21.6k]
  |  |  ------------------
  ------------------
  128|   309k|      if (c == 0) return mag;
  ------------------
  |  Branch (128:11): [True: 31.9k, False: 277k]
  ------------------
  129|   277k|      if (row == 0) return mag + 7;
  ------------------
  |  Branch (129:11): [True: 138k, False: 138k]
  ------------------
  130|   138k|      break;
  131|   138k|    default: break;
  ------------------
  |  Branch (131:5): [True: 0, False: 4.94M]
  ------------------
  132|  4.94M|  }
  133|       |
  134|   477k|  return mag + 14;
  135|  4.94M|}
decodetxb.c:set_dc_sign:
  274|  21.6M|static inline void set_dc_sign(int *cul_level, int dc_val) {
  275|  21.6M|  if (dc_val < 0)
  ------------------
  |  Branch (275:7): [True: 5.65M, False: 16.0M]
  ------------------
  276|  5.65M|    *cul_level |= 1 << COEFF_CONTEXT_BITS;
  ------------------
  |  |   51|  5.65M|#define COEFF_CONTEXT_BITS 3
  ------------------
  277|  16.0M|  else if (dc_val > 0)
  ------------------
  |  Branch (277:12): [True: 12.7M, False: 3.26M]
  ------------------
  278|  12.7M|    *cul_level += 2 << COEFF_CONTEXT_BITS;
  ------------------
  |  |   51|  12.7M|#define COEFF_CONTEXT_BITS 3
  ------------------
  279|  21.6M|}

av1_get_shear_params:
  243|   592k|int av1_get_shear_params(WarpedMotionParams *wm) {
  244|   592k|#ifndef NDEBUG
  245|       |  // Check that models have been constructed sensibly
  246|       |  // This is a good place to check, because this function does not need to
  247|       |  // be called until after model construction is complete, but must be called
  248|       |  // before the model can be used for prediction.
  249|   592k|  check_model_consistency(wm);
  250|   592k|#endif  // NDEBUG
  251|       |
  252|   592k|  const int32_t *mat = wm->wmmat;
  253|   592k|  if (!is_affine_valid(wm)) return 0;
  ------------------
  |  Branch (253:7): [True: 1.71k, False: 590k]
  ------------------
  254|       |
  255|   590k|  wm->alpha =
  256|   590k|      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  ------------------
  |  |   96|   590k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  257|   590k|  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  258|   590k|  int16_t shift;
  259|   590k|  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  ------------------
  |  Branch (259:58): [True: 0, False: 590k]
  ------------------
  260|   590k|  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  ------------------
  |  |   96|   590k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  261|   590k|  wm->gamma =
  262|   590k|      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  ------------------
  |  |   58|   590k|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|  83.3k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 83.3k, False: 507k]
  |  |  ------------------
  |  |   59|   590k|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|   507k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  263|   590k|  v = ((int64_t)mat[3] * mat[4]) * y;
  264|   590k|  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
  ------------------
  |  |   58|   590k|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|  64.6k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 64.6k, False: 526k]
  |  |  ------------------
  |  |   59|   590k|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|   526k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  265|   590k|                        (1 << WARPEDMODEL_PREC_BITS),
  ------------------
  |  |   96|   590k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  266|   590k|                    INT16_MIN, INT16_MAX);
  267|       |
  268|   590k|  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
  ------------------
  |  |   45|   590k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|   136k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 136k, False: 453k]
  |  |  ------------------
  |  |   46|   590k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|   453k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  269|   590k|              (1 << WARP_PARAM_REDUCE_BITS);
  ------------------
  |  |  105|   590k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
  270|   590k|  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
  ------------------
  |  |   45|   590k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  63.0k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 63.0k, False: 527k]
  |  |  ------------------
  |  |   46|   590k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|   527k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  271|   590k|             (1 << WARP_PARAM_REDUCE_BITS);
  ------------------
  |  |  105|   590k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
  272|   590k|  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
  ------------------
  |  |   45|   590k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|  83.3k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 83.3k, False: 507k]
  |  |  ------------------
  |  |   46|   590k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|   507k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  273|   590k|              (1 << WARP_PARAM_REDUCE_BITS);
  ------------------
  |  |  105|   590k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
  274|   590k|  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
  ------------------
  |  |   45|   590k|  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   41|   145k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (45:4): [True: 145k, False: 445k]
  |  |  ------------------
  |  |   46|   590k|                 : ROUND_POWER_OF_TWO((value), (n)))
  |  |  ------------------
  |  |  |  |   41|   445k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  275|   590k|              (1 << WARP_PARAM_REDUCE_BITS);
  ------------------
  |  |  105|   590k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
  276|       |
  277|   590k|  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
  ------------------
  |  Branch (277:7): [True: 26.5k, False: 564k]
  ------------------
  278|  26.5k|    return 0;
  279|       |
  280|   564k|  return 1;
  281|   590k|}
highbd_warp_plane:
  419|   264k|                       int bd, ConvolveParams *conv_params) {
  420|   264k|  const int32_t *const mat = wm->wmmat;
  421|   264k|  const int16_t alpha = wm->alpha;
  422|   264k|  const int16_t beta = wm->beta;
  423|   264k|  const int16_t gamma = wm->gamma;
  424|   264k|  const int16_t delta = wm->delta;
  425|       |
  426|   264k|  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
  427|   264k|                         p_width, p_height, p_stride, subsampling_x,
  428|   264k|                         subsampling_y, bd, conv_params, alpha, beta, gamma,
  429|   264k|                         delta);
  430|   264k|}
warp_plane:
  651|   238k|                int subsampling_y, ConvolveParams *conv_params) {
  652|   238k|  const int32_t *const mat = wm->wmmat;
  653|   238k|  const int16_t alpha = wm->alpha;
  654|   238k|  const int16_t beta = wm->beta;
  655|   238k|  const int16_t gamma = wm->gamma;
  656|   238k|  const int16_t delta = wm->delta;
  657|   238k|  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
  658|   238k|                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
  659|   238k|                  alpha, beta, gamma, delta);
  660|   238k|}
av1_warp_plane:
  666|   502k|                    int subsampling_y, ConvolveParams *conv_params) {
  667|   502k|#if CONFIG_AV1_HIGHBITDEPTH
  668|   502k|  if (use_hbd)
  ------------------
  |  Branch (668:7): [True: 264k, False: 238k]
  ------------------
  669|   264k|    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
  ------------------
  |  |   75|   264k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  670|   264k|                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
  ------------------
  |  |   75|   264k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  671|   264k|                      p_height, p_stride, subsampling_x, subsampling_y, bd,
  672|   264k|                      conv_params);
  673|   238k|  else
  674|   238k|    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
  675|   238k|               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
  676|       |#else
  677|       |  (void)use_hbd;
  678|       |  (void)bd;
  679|       |  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
  680|       |             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
  681|       |#endif
  682|   502k|}
av1_find_projection:
  908|   305k|                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
  909|   305k|  assert(wm_params->wmtype == AFFINE);
  910|       |
  911|   305k|  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
  ------------------
  |  Branch (911:7): [True: 29.7k, False: 276k]
  ------------------
  912|   305k|                      mi_col))
  913|  29.7k|    return 1;
  914|       |
  915|       |  // check compatibility with the fast warp filter
  916|   276k|  if (!av1_get_shear_params(wm_params)) return 1;
  ------------------
  |  Branch (916:7): [True: 18.3k, False: 257k]
  ------------------
  917|       |
  918|   257k|  return 0;
  919|   276k|}
warped_motion.c:check_model_consistency:
  222|   592k|static void check_model_consistency(WarpedMotionParams *wm) {
  223|   592k|  switch (wm->wmtype) {
  224|   289k|    case IDENTITY:
  ------------------
  |  Branch (224:5): [True: 289k, False: 302k]
  ------------------
  225|   289k|      assert(wm->wmmat[0] == 0);
  226|   289k|      assert(wm->wmmat[1] == 0);
  227|   289k|      AOM_FALLTHROUGH_INTENDED;
  ------------------
  |  |   52|   289k|  do {                           \
  |  |   53|   289k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (53:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  228|   293k|    case TRANSLATION:
  ------------------
  |  Branch (228:5): [True: 3.23k, False: 589k]
  ------------------
  229|   293k|      assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
  230|   293k|      assert(wm->wmmat[3] == 0);
  231|   293k|      AOM_FALLTHROUGH_INTENDED;
  ------------------
  |  |   52|   293k|  do {                           \
  |  |   53|   293k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (53:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  232|   310k|    case ROTZOOM:
  ------------------
  |  Branch (232:5): [True: 17.3k, False: 575k]
  ------------------
  233|   310k|      assert(wm->wmmat[4] == -wm->wmmat[3]);
  234|   310k|      assert(wm->wmmat[5] == wm->wmmat[2]);
  235|   310k|      AOM_FALLTHROUGH_INTENDED;
  ------------------
  |  |   52|   310k|  do {                           \
  |  |   53|   310k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (53:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  236|   592k|    case AFFINE: break;
  ------------------
  |  Branch (236:5): [True: 282k, False: 310k]
  ------------------
  237|      0|    default: assert(0 && "Bad wmtype");
  ------------------
  |  Branch (237:5): [True: 0, False: 592k]
  ------------------
  238|   592k|  }
  239|   592k|}
warped_motion.c:is_affine_valid:
  205|   592k|static int is_affine_valid(const WarpedMotionParams *const wm) {
  206|   592k|  const int32_t *mat = wm->wmmat;
  207|   592k|  return (mat[2] > 0);
  208|   592k|}
warped_motion.c:resolve_divisor_32:
  189|   590k|static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
  190|   590k|  int32_t f;
  191|   590k|  *shift = get_msb(D);
  192|       |  // e is obtained from D after resetting the most significant 1 bit.
  193|   590k|  const int32_t e = D - ((uint32_t)1 << *shift);
  194|       |  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  195|   590k|  if (*shift > DIV_LUT_BITS)
  ------------------
  |  |  140|   590k|#define DIV_LUT_BITS 8
  ------------------
  |  Branch (195:7): [True: 590k, False: 310]
  ------------------
  196|   590k|    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
  ------------------
  |  |   41|   590k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  197|    310|  else
  198|    310|    f = e << (DIV_LUT_BITS - *shift);
  ------------------
  |  |  140|    310|#define DIV_LUT_BITS 8
  ------------------
  199|   590k|  assert(f <= DIV_LUT_NUM);
  200|   590k|  *shift += DIV_LUT_PREC_BITS;
  ------------------
  |  |  139|   590k|#define DIV_LUT_PREC_BITS 14
  ------------------
  201|       |  // Use f as lookup into the precomputed table of multipliers
  202|   590k|  return div_lut[f];
  203|   590k|}
warped_motion.c:is_affine_shear_allowed:
  211|   590k|                                   int16_t delta) {
  212|   590k|  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
  ------------------
  |  |   96|   590k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  |  Branch (212:7): [True: 20.9k, False: 569k]
  ------------------
  213|   590k|      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
  ------------------
  |  |   96|   569k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  |  Branch (213:7): [True: 5.57k, False: 564k]
  ------------------
  214|  26.5k|    return 0;
  215|   564k|  else
  216|   564k|    return 1;
  217|   590k|}
warped_motion.c:find_affine_int:
  798|   305k|                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  799|   305k|  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  800|   305k|  int32_t Bx[2] = { 0, 0 };
  801|   305k|  int32_t By[2] = { 0, 0 };
  802|       |
  803|   305k|  const int bw = block_size_wide[bsize];
  804|   305k|  const int bh = block_size_high[bsize];
  805|   305k|  const int rsuy = bh / 2 - 1;
  806|   305k|  const int rsux = bw / 2 - 1;
  807|   305k|  const int suy = rsuy * 8;
  808|   305k|  const int sux = rsux * 8;
  809|   305k|  const int duy = suy + mvy;
  810|   305k|  const int dux = sux + mvx;
  811|       |
  812|       |  // Assume the center pixel of the block has exactly the same motion vector
  813|       |  // as transmitted for the block. First shift the origin of the source
  814|       |  // points to the block center, and the origin of the destination points to
  815|       |  // the block center added to the motion vector transmitted.
  816|       |  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  817|       |  // points after origin shfifting, for i = 0, 1, 2, .... n-1.
  818|       |  // Then if  P = [x0, y0,
  819|       |  //               x1, y1
  820|       |  //               x2, y1,
  821|       |  //                ....
  822|       |  //              ]
  823|       |  //          q = [x0', x1', x2', ... ]'
  824|       |  //          r = [y0', y1', y2', ... ]'
  825|       |  // the least squares problems that need to be solved are:
  826|       |  //          [h1, h2]' = inv(P'P)P'q and
  827|       |  //          [h3, h4]' = inv(P'P)P'r
  828|       |  // where the affine transformation is given by:
  829|       |  //          x' = h1.x + h2.y
  830|       |  //          y' = h3.x + h4.y
  831|       |  //
  832|       |  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  833|       |  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
  834|       |  // Contribution from neighbor block
  835|   867k|  for (int i = 0; i < np; i++) {
  ------------------
  |  Branch (835:19): [True: 561k, False: 305k]
  ------------------
  836|   561k|    const int dx = pts2[i * 2] - dux;
  837|   561k|    const int dy = pts2[i * 2 + 1] - duy;
  838|   561k|    const int sx = pts1[i * 2] - sux;
  839|   561k|    const int sy = pts1[i * 2 + 1] - suy;
  840|       |    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
  841|       |    // selection is done in find_samples(). Also, global offset can be removed
  842|       |    // while collecting samples.
  843|   561k|    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
  ------------------
  |  |  684|  1.12M|#define LS_MV_MAX 256  // max mv in 1/8-pel
  ------------------
                  if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
  ------------------
  |  |  684|   544k|#define LS_MV_MAX 256  // max mv in 1/8-pel
  ------------------
  |  Branch (843:9): [True: 544k, False: 16.9k]
  |  Branch (843:37): [True: 532k, False: 12.8k]
  ------------------
  844|   532k|      A[0][0] += LS_SQUARE(sx);
  ------------------
  |  |  710|   532k|  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |  711|   532k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|   532k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  845|   532k|      A[0][1] += LS_PRODUCT1(sx, sy);
  ------------------
  |  |  713|   532k|  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |  714|   532k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|   532k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  846|   532k|      A[1][1] += LS_SQUARE(sy);
  ------------------
  |  |  710|   532k|  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |  711|   532k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|   532k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  847|   532k|      Bx[0] += LS_PRODUCT2(sx, dx);
  ------------------
  |  |  716|   532k|  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |  717|   532k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|   532k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  848|   532k|      Bx[1] += LS_PRODUCT1(sy, dx);
  ------------------
  |  |  713|   532k|  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |  714|   532k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|   532k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  849|   532k|      By[0] += LS_PRODUCT1(sx, dy);
  ------------------
  |  |  713|   532k|  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |  714|   532k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|   532k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  850|   532k|      By[1] += LS_PRODUCT2(sy, dy);
  ------------------
  |  |  716|   532k|  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |                 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
  |  |  ------------------
  |  |  |  |  686|   532k|#define LS_STEP 8
  |  |  ------------------
  |  |  717|   532k|   (2 + LS_MAT_DOWN_BITS))
  |  |  ------------------
  |  |  |  |  700|   532k|#define LS_MAT_DOWN_BITS 2
  |  |  ------------------
  ------------------
  851|   532k|    }
  852|   561k|  }
  853|       |
  854|       |  // Just for debugging, and can be removed later.
  855|   305k|  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  856|   305k|  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  857|   305k|  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  858|   305k|  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  859|   305k|  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  860|   305k|  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  861|   305k|  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
  862|       |
  863|       |  // Compute Determinant of A
  864|   305k|  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  865|   305k|  if (Det == 0) return 1;
  ------------------
  |  Branch (865:7): [True: 29.7k, False: 276k]
  ------------------
  866|       |
  867|   276k|  int16_t shift;
  868|   276k|  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  ------------------
  |  Branch (868:60): [True: 0, False: 276k]
  ------------------
  869|   276k|  shift -= WARPEDMODEL_PREC_BITS;
  ------------------
  |  |   96|   276k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  870|   276k|  if (shift < 0) {
  ------------------
  |  Branch (870:7): [True: 0, False: 276k]
  ------------------
  871|      0|    iDet <<= (-shift);
  872|      0|    shift = 0;
  873|      0|  }
  874|       |
  875|   276k|  int64_t Px[2], Py[2];
  876|       |  // These divided by the Det, are the least squares solutions
  877|   276k|  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  878|   276k|  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  879|   276k|  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  880|   276k|  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
  881|       |
  882|   276k|  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  883|   276k|  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  884|   276k|  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  885|   276k|  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
  886|       |
  887|   276k|  const int isuy = (mi_row * MI_SIZE + rsuy);
  ------------------
  |  |   40|   276k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   276k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  888|   276k|  const int isux = (mi_col * MI_SIZE + rsux);
  ------------------
  |  |   40|   276k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   276k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  889|       |  // Note: In the vx, vy expressions below, the max value of each of the
  890|       |  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
  891|       |  // for the first term so that the overall sum in the worst case fits
  892|       |  // within 32 bits overall.
  893|   276k|  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
  ------------------
  |  |   96|   276k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  894|   276k|                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
  ------------------
  |  |   96|   276k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  895|   276k|                      isuy * wm->wmmat[3]);
  896|   276k|  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
  ------------------
  |  |   96|   276k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  897|   276k|                     (isux * wm->wmmat[4] +
  898|   276k|                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  ------------------
  |  |   96|   276k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
  899|   276k|  wm->wmmat[0] =
  900|   276k|      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  ------------------
  |  |   98|   276k|#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   276k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
                    clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  ------------------
  |  |   98|   276k|#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   276k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  901|   276k|  wm->wmmat[1] =
  902|   276k|      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  ------------------
  |  |   98|   276k|#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   276k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
                    clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  ------------------
  |  |   98|   276k|#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   276k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  903|   276k|  return 0;
  904|   305k|}
warped_motion.c:resolve_divisor_64:
  172|   276k|static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
  173|   276k|  int64_t f;
  174|   276k|  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
  ------------------
  |  Branch (174:22): [True: 2.64k, False: 273k]
  ------------------
  175|   276k|                               : get_msb((unsigned int)D));
  176|       |  // e is obtained from D after resetting the most significant 1 bit.
  177|   276k|  const int64_t e = D - ((uint64_t)1 << *shift);
  178|       |  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  179|   276k|  if (*shift > DIV_LUT_BITS)
  ------------------
  |  |  140|   276k|#define DIV_LUT_BITS 8
  ------------------
  |  Branch (179:7): [True: 276k, False: 0]
  ------------------
  180|   276k|    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
  ------------------
  |  |   53|   276k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  ------------------
  181|      0|  else
  182|      0|    f = e << (DIV_LUT_BITS - *shift);
  ------------------
  |  |  140|      0|#define DIV_LUT_BITS 8
  ------------------
  183|   276k|  assert(f <= DIV_LUT_NUM);
  184|   276k|  *shift += DIV_LUT_PREC_BITS;
  ------------------
  |  |  139|   276k|#define DIV_LUT_PREC_BITS 14
  ------------------
  185|       |  // Use f as lookup into the precomputed table of multipliers
  186|   276k|  return div_lut[f];
  187|   276k|}
warped_motion.c:get_mult_shift_diag:
  787|   552k|static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  788|   552k|  int64_t v = Px * (int64_t)iDet;
  789|   552k|  return (int32_t)clamp64(
  790|   552k|      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
  ------------------
  |  |   58|   552k|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|  6.66k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 6.66k, False: 545k]
  |  |  ------------------
  |  |   59|   552k|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|   545k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  791|   552k|      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
  ------------------
  |  |   96|   552k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
                    (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
  ------------------
  |  |   99|   552k|#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
  |  |  ------------------
  |  |  |  |   96|   552k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  792|   552k|      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  ------------------
  |  |   96|   552k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
                    (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  ------------------
  |  |   99|   552k|#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
  |  |  ------------------
  |  |  |  |   96|   552k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  793|   552k|}
warped_motion.c:get_mult_shift_ndiag:
  780|   552k|static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  781|   552k|  int64_t v = Px * (int64_t)iDet;
  782|   552k|  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
  ------------------
  |  |   58|   552k|  (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
  |  |  ------------------
  |  |  |  |   53|   127k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (58:4): [True: 127k, False: 424k]
  |  |  ------------------
  |  |   59|   552k|                 : ROUND_POWER_OF_TWO_64((value), (n)))
  |  |  ------------------
  |  |  |  |   53|   424k|  (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  ------------------
  783|   552k|                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
  ------------------
  |  |   99|   552k|#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
  |  |  ------------------
  |  |  |  |   96|   552k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  784|   552k|                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  ------------------
  |  |   99|   552k|#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
  |  |  ------------------
  |  |  |  |   96|   552k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
  785|   552k|}

av1_convolve_horiz_rs_sse4_1:
   28|  22.2k|                                  int x_step_qn) {
   29|  22.2k|  assert(UPSCALE_NORMATIVE_TAPS == 8);
   30|       |
   31|  22.2k|  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
  ------------------
  |  |  101|  22.2k|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   32|       |
   33|  22.2k|  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  ------------------
  |  |   21|  22.2k|#define FILTER_BITS 7
  ------------------
   34|  22.2k|  const __m128i zero = _mm_setzero_si128();
   35|       |
   36|  22.2k|  const uint8_t *src_y;
   37|  22.2k|  uint8_t *dst_y;
   38|  22.2k|  int x_qn = x0_qn;
   39|  1.32M|  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
  ------------------
  |  Branch (39:19): [True: 1.30M, False: 22.2k]
  ------------------
   40|  1.30M|    const int x_filter_idx0 =
   41|  1.30M|        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  1.30M|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  1.30M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  1.30M|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  1.30M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  1.30M|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
   42|  1.30M|    const int x_filter_idx1 =
   43|  1.30M|        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  1.30M|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  1.30M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  1.30M|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  1.30M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  1.30M|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
   44|  1.30M|    const int x_filter_idx2 =
   45|  1.30M|        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  1.30M|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  1.30M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  1.30M|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  1.30M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  1.30M|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
   46|  1.30M|    const int x_filter_idx3 =
   47|  1.30M|        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  1.30M|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  1.30M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  1.30M|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  1.30M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  1.30M|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
   48|       |
   49|  1.30M|    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
   50|  1.30M|    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
   51|  1.30M|    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
   52|  1.30M|    assert(x_filter_idx3 <= RS_SUBPEL_MASK);
   53|       |
   54|  1.30M|    const int16_t *const x_filter0 =
   55|  1.30M|        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  1.30M|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   56|  1.30M|    const int16_t *const x_filter1 =
   57|  1.30M|        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  1.30M|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   58|  1.30M|    const int16_t *const x_filter2 =
   59|  1.30M|        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  1.30M|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   60|  1.30M|    const int16_t *const x_filter3 =
   61|  1.30M|        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  1.30M|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
   62|       |
   63|  1.30M|    const __m128i fil0_16 = xx_loadu_128(x_filter0);
   64|  1.30M|    const __m128i fil1_16 = xx_loadu_128(x_filter1);
   65|  1.30M|    const __m128i fil2_16 = xx_loadu_128(x_filter2);
   66|  1.30M|    const __m128i fil3_16 = xx_loadu_128(x_filter3);
   67|       |
   68|  1.30M|    src_y = src;
   69|  1.30M|    dst_y = dst;
   70|  49.4M|    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
  ------------------
  |  Branch (70:21): [True: 48.1M, False: 1.30M]
  ------------------
   71|  48.1M|      const uint8_t *const src_x0 =
   72|  48.1M|          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|  48.1M|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
   73|  48.1M|      const uint8_t *const src_x1 =
   74|  48.1M|          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|  48.1M|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
   75|  48.1M|      const uint8_t *const src_x2 =
   76|  48.1M|          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|  48.1M|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
   77|  48.1M|      const uint8_t *const src_x3 =
   78|  48.1M|          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|  48.1M|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
   79|       |
   80|       |      // Load up the source data. This is 8-bit input data, so each load
   81|       |      // gets 8 pixels.
   82|  48.1M|      const __m128i src0_8 = xx_loadl_64(src_x0);
   83|  48.1M|      const __m128i src1_8 = xx_loadl_64(src_x1);
   84|  48.1M|      const __m128i src2_8 = xx_loadl_64(src_x2);
   85|  48.1M|      const __m128i src3_8 = xx_loadl_64(src_x3);
   86|       |
   87|       |      // Now zero-extend up to 16-bit precision, i.e.
   88|       |      // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
   89|  48.1M|      const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
   90|  48.1M|      const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
   91|  48.1M|      const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
   92|  48.1M|      const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
   93|       |
   94|       |      // Multiply by filter coefficients (results in a 32-bit value),
   95|       |      // and add adjacent pairs, i.e.
   96|       |      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
   97|       |      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
   98|  48.1M|      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
   99|  48.1M|      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
  100|  48.1M|      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
  101|  48.1M|      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
  102|       |
  103|       |      // Reduce horizontally and add, i.e.
  104|       |      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
  105|  48.1M|      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
  106|  48.1M|      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
  107|       |
  108|  48.1M|      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
  109|       |
  110|       |      // Divide down by (1 << FILTER_BITS), rounding to nearest.
  111|  48.1M|      const __m128i shifted_32 =
  112|  48.1M|          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
  ------------------
  |  |   21|  48.1M|#define FILTER_BITS 7
  ------------------
  113|       |
  114|       |      // Pack 32-bit values into 16-bit values, i.e.
  115|       |      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
  116|  48.1M|      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
  117|       |
  118|       |      // Pack 16-bit values into 8-bit values, i.e.
  119|       |      // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
  120|       |      // -> [ 0 0 0 0 0 0 DC BA ]
  121|  48.1M|      const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
  122|       |
  123|       |      // Write to the output
  124|  48.1M|      xx_storel_32(&dst_y[x], shifted_8);
  125|  48.1M|    }
  126|  1.30M|  }
  127|  22.2k|}
av1_highbd_convolve_horiz_rs_sse4_1:
  137|  23.8k|                                         int x0_qn, int x_step_qn, int bd) {
  138|  23.8k|  assert(UPSCALE_NORMATIVE_TAPS == 8);
  139|  23.8k|  assert(bd == 8 || bd == 10 || bd == 12);
  140|       |
  141|  23.8k|  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
  ------------------
  |  |  101|  23.8k|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
  142|       |
  143|  23.8k|  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  ------------------
  |  |   21|  23.8k|#define FILTER_BITS 7
  ------------------
  144|  23.8k|  const __m128i zero = _mm_setzero_si128();
  145|  23.8k|  const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);
  146|       |
  147|  23.8k|  const uint16_t *src_y;
  148|  23.8k|  uint16_t *dst_y;
  149|  23.8k|  int x_qn = x0_qn;
  150|  1.24M|  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
  ------------------
  |  Branch (150:19): [True: 1.22M, False: 23.8k]
  ------------------
  151|  1.22M|    const int x_filter_idx0 =
  152|  1.22M|        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  1.22M|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  1.22M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  1.22M|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  1.22M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  1.22M|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
  153|  1.22M|    const int x_filter_idx1 =
  154|  1.22M|        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  1.22M|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  1.22M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  1.22M|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  1.22M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  1.22M|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
  155|  1.22M|    const int x_filter_idx2 =
  156|  1.22M|        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  1.22M|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  1.22M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  1.22M|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  1.22M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  1.22M|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
  157|  1.22M|    const int x_filter_idx3 =
  158|  1.22M|        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   37|  1.22M|#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   36|  1.22M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  ------------------
                      ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
  ------------------
  |  |   38|  1.22M|#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   36|  1.22M|#define RS_SCALE_SUBPEL_BITS 14
  |  |  ------------------
  |  |               #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   34|  1.22M|#define RS_SUBPEL_BITS 6
  |  |  ------------------
  ------------------
  159|       |
  160|  1.22M|    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
  161|  1.22M|    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
  162|  1.22M|    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
  163|  1.22M|    assert(x_filter_idx3 <= RS_SUBPEL_MASK);
  164|       |
  165|  1.22M|    const int16_t *const x_filter0 =
  166|  1.22M|        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  1.22M|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
  167|  1.22M|    const int16_t *const x_filter1 =
  168|  1.22M|        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  1.22M|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
  169|  1.22M|    const int16_t *const x_filter2 =
  170|  1.22M|        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  1.22M|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
  171|  1.22M|    const int16_t *const x_filter3 =
  172|  1.22M|        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
  ------------------
  |  |  101|  1.22M|#define UPSCALE_NORMATIVE_TAPS 8
  ------------------
  173|       |
  174|  1.22M|    const __m128i fil0_16 = xx_loadu_128(x_filter0);
  175|  1.22M|    const __m128i fil1_16 = xx_loadu_128(x_filter1);
  176|  1.22M|    const __m128i fil2_16 = xx_loadu_128(x_filter2);
  177|  1.22M|    const __m128i fil3_16 = xx_loadu_128(x_filter3);
  178|       |
  179|  1.22M|    src_y = src;
  180|  1.22M|    dst_y = dst;
  181|   137M|    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
  ------------------
  |  Branch (181:21): [True: 136M, False: 1.22M]
  ------------------
  182|   136M|      const uint16_t *const src_x0 =
  183|   136M|          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|   136M|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  184|   136M|      const uint16_t *const src_x1 =
  185|   136M|          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|   136M|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  186|   136M|      const uint16_t *const src_x2 =
  187|   136M|          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|   136M|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  188|   136M|      const uint16_t *const src_x3 =
  189|   136M|          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
  ------------------
  |  |   36|   136M|#define RS_SCALE_SUBPEL_BITS 14
  ------------------
  190|       |
  191|       |      // Load up the source data. This is 16-bit input data, so each load
  192|       |      // gets 8 pixels.
  193|   136M|      const __m128i src0_16 = xx_loadu_128(src_x0);
  194|   136M|      const __m128i src1_16 = xx_loadu_128(src_x1);
  195|   136M|      const __m128i src2_16 = xx_loadu_128(src_x2);
  196|   136M|      const __m128i src3_16 = xx_loadu_128(src_x3);
  197|       |
  198|       |      // Multiply by filter coefficients (results in a 32-bit value),
  199|       |      // and add adjacent pairs, i.e.
  200|       |      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
  201|       |      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
  202|   136M|      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
  203|   136M|      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
  204|   136M|      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
  205|   136M|      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
  206|       |
  207|       |      // Reduce horizontally and add, i.e.
  208|       |      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
  209|   136M|      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
  210|   136M|      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
  211|       |
  212|   136M|      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
  213|       |
  214|       |      // Divide down by (1 << FILTER_BITS), rounding to nearest.
  215|   136M|      const __m128i shifted_32 =
  216|   136M|          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
  ------------------
  |  |   21|   136M|#define FILTER_BITS 7
  ------------------
  217|       |
  218|       |      // Pack 32-bit values into 16-bit values, i.e.
  219|       |      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
  220|   136M|      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
  221|       |
  222|       |      // Clip the values at (1 << bd) - 1
  223|   136M|      const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
  224|       |
  225|       |      // Write to the output
  226|   136M|      xx_storel_64(&dst_y[x], clipped_16);
  227|   136M|    }
  228|  1.22M|  }
  229|  23.8k|}

av1_convolve_2d_scale_sse4_1:
  238|  1.59M|                                  ConvolveParams *conv_params) {
  239|  1.59M|  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  240|  1.59M|  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
  ------------------
  |  |   28|  1.59M|#define SCALE_SUBPEL_BITS 10
  ------------------
  241|  1.59M|             filter_params_y->taps;
  242|       |
  243|  1.59M|  const int xtaps = filter_params_x->taps;
  244|  1.59M|  const int ytaps = filter_params_y->taps;
  245|  1.59M|  const int fo_vert = ytaps / 2 - 1;
  246|  1.59M|  assert((xtaps == 8) && (ytaps == 8));
  247|  1.59M|  (void)xtaps;
  248|       |
  249|       |  // horizontal filter
  250|  1.59M|  hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
  251|  1.59M|           x_step_qn, filter_params_x, conv_params->round_0);
  252|       |
  253|       |  // vertical filter (input is transposed)
  254|  1.59M|  vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
  255|  1.59M|           filter_params_y, conv_params, 8);
  256|  1.59M|}
av1_highbd_convolve_2d_scale_sse4_1:
  477|   724k|    ConvolveParams *conv_params, int bd) {
  478|       |  // TODO(yaowu): Move this out of stack
  479|   724k|  DECLARE_ALIGNED(16, int16_t,
  ------------------
  |  |   19|   724k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  480|   724k|                  tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  481|   724k|  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
  ------------------
  |  |   28|   724k|#define SCALE_SUBPEL_BITS 10
  ------------------
  482|   724k|             filter_params_y->taps;
  483|   724k|  const int xtaps = filter_params_x->taps;
  484|   724k|  const int ytaps = filter_params_y->taps;
  485|   724k|  const int fo_vert = ytaps / 2 - 1;
  486|       |
  487|   724k|  memset(tmp, 0, sizeof(tmp));
  488|   724k|  assert((xtaps == 8) && (ytaps == 8));
  489|   724k|  (void)xtaps;
  490|       |
  491|       |  // horizontal filter
  492|   724k|  highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
  493|   724k|                  subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0,
  494|   724k|                  bd);
  495|       |
  496|       |  // vertical filter (input is transposed)
  497|   724k|  highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
  498|   724k|                  filter_params_y, conv_params, bd);
  499|   724k|}
av1_convolve_scale_sse4.c:hfilter8:
   25|  1.59M|                     const InterpFilterParams *filter_params, int round) {
   26|  1.59M|  const int bd = 8;
   27|  1.59M|  const int ntaps = 8;
   28|       |
   29|  1.59M|  src -= ntaps / 2 - 1;
   30|       |
   31|  1.59M|  int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
  ------------------
  |  |   21|  1.59M|#define FILTER_BITS 7
  ------------------
   32|  1.59M|  const __m128i round_add = _mm_set1_epi32(round_add32);
   33|  1.59M|  const __m128i round_shift = _mm_cvtsi32_si128(round);
   34|       |
   35|  1.59M|  int x_qn = subpel_x_qn;
   36|  14.4M|  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
  ------------------
  |  Branch (36:19): [True: 12.8M, False: 1.59M]
  ------------------
   37|  12.8M|    const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
  ------------------
  |  |   28|  12.8M|#define SCALE_SUBPEL_BITS 10
  ------------------
   38|  12.8M|    const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   30|  12.8M|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|  12.8M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  12.8M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                  const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  12.8M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  12.8M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  12.8M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   39|  12.8M|    assert(filter_idx < SUBPEL_SHIFTS);
   40|  12.8M|    const int16_t *filter =
   41|  12.8M|        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
   42|       |
   43|       |    // Load the filter coefficients
   44|  12.8M|    const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
   45|  12.8M|    const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
   46|       |
   47|  12.8M|    int y;
   48|  52.1M|    for (y = 0; y <= h - 4; y += 4) {
  ------------------
  |  Branch (48:17): [True: 39.3M, False: 12.8M]
  ------------------
   49|  39.3M|      const uint8_t *const src0 = src_col + y * src_stride;
   50|  39.3M|      const uint8_t *const src1 = src0 + 1 * src_stride;
   51|  39.3M|      const uint8_t *const src2 = src0 + 2 * src_stride;
   52|  39.3M|      const uint8_t *const src3 = src0 + 3 * src_stride;
   53|       |
   54|       |      // Load up source data. This is 8-bit input data; each load is just
   55|       |      // loading the lower half of the register and gets 8 pixels
   56|  39.3M|      const __m128i data08 = _mm_loadl_epi64((__m128i *)src0);
   57|  39.3M|      const __m128i data18 = _mm_loadl_epi64((__m128i *)src1);
   58|  39.3M|      const __m128i data28 = _mm_loadl_epi64((__m128i *)src2);
   59|  39.3M|      const __m128i data38 = _mm_loadl_epi64((__m128i *)src3);
   60|       |
   61|       |      // Now zero-extend up to 16-bit precision by interleaving with
   62|       |      // zeros. Drop the upper half of each register (which just had zeros)
   63|  39.3M|      const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
   64|  39.3M|      const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
   65|  39.3M|      const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
   66|  39.3M|      const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
   67|       |
   68|       |      // Multiply by coefficients
   69|  39.3M|      const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
   70|  39.3M|      const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
   71|  39.3M|      const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
   72|  39.3M|      const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
   73|       |
   74|       |      // Reduce horizontally and add
   75|  39.3M|      const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
   76|  39.3M|      const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
   77|  39.3M|      const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
   78|       |
   79|       |      // Divide down by (1 << round), rounding to nearest.
   80|  39.3M|      __m128i shifted =
   81|  39.3M|          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
   82|       |
   83|  39.3M|      shifted = _mm_packus_epi32(shifted, shifted);
   84|       |      // Write transposed to the output
   85|  39.3M|      _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
   86|  39.3M|    }
   87|  35.3M|    for (; y < h; ++y) {
  ------------------
  |  Branch (87:12): [True: 22.4M, False: 12.8M]
  ------------------
   88|  22.4M|      const uint8_t *const src_row = src_col + y * src_stride;
   89|       |
   90|  22.4M|      int32_t sum = (1 << (bd + FILTER_BITS - 1));
  ------------------
  |  |   21|  22.4M|#define FILTER_BITS 7
  ------------------
   91|   201M|      for (int k = 0; k < ntaps; ++k) {
  ------------------
  |  Branch (91:23): [True: 179M, False: 22.4M]
  ------------------
   92|   179M|        sum += filter[k] * src_row[k];
   93|   179M|      }
   94|       |
   95|  22.4M|      dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
  ------------------
  |  |   41|  22.4M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   96|  22.4M|    }
   97|  12.8M|  }
   98|  1.59M|}
av1_convolve_scale_sse4.c:vfilter8:
  110|  1.59M|                     const ConvolveParams *conv_params, int bd) {
  111|  1.59M|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|  1.59M|#define FILTER_BITS 7
  ------------------
  112|  1.59M|  const int ntaps = 8;
  113|       |
  114|  1.59M|  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
  115|       |
  116|  1.59M|  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
  117|  1.59M|                         (1 << (offset_bits - conv_params->round_1 - 1)));
  118|  1.59M|  const __m128i sub = _mm_set1_epi16(sub32);
  119|       |
  120|  1.59M|  CONV_BUF_TYPE *dst16 = conv_params->dst;
  121|  1.59M|  const int dst16_stride = conv_params->dst_stride;
  122|  1.59M|  const int bits =
  123|  1.59M|      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  1.59M|#define FILTER_BITS 7
  ------------------
  124|  1.59M|  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
  125|  1.59M|  const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
  126|  1.59M|  const __m128i round_shift_add =
  127|  1.59M|      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
  128|  1.59M|  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
  129|       |
  130|  1.59M|  const int w0 = conv_params->fwd_offset;
  131|  1.59M|  const int w1 = conv_params->bck_offset;
  132|  1.59M|  const __m128i wt0 = _mm_set1_epi16((short)w0);
  133|  1.59M|  const __m128i wt1 = _mm_set1_epi16((short)w1);
  134|  1.59M|  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
  135|       |
  136|  1.59M|  int y_qn = subpel_y_qn;
  137|  15.0M|  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
  ------------------
  |  Branch (137:19): [True: 13.4M, False: 1.59M]
  ------------------
  138|  13.4M|    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
  ------------------
  |  |   28|  13.4M|#define SCALE_SUBPEL_BITS 10
  ------------------
  139|  13.4M|    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   30|  13.4M|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|  13.4M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  13.4M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                  const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  13.4M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  13.4M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  13.4M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  140|  13.4M|    assert(filter_idx < SUBPEL_SHIFTS);
  141|  13.4M|    const int16_t *filter =
  142|  13.4M|        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
  143|       |
  144|  13.4M|    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
  145|  13.4M|    int x;
  146|  63.6M|    for (x = 0; x <= w - 4; x += 4) {
  ------------------
  |  Branch (146:17): [True: 50.1M, False: 13.4M]
  ------------------
  147|  50.1M|      const int16_t *const src0 = src_y + x * src_stride;
  148|  50.1M|      const int16_t *const src1 = src0 + 1 * src_stride;
  149|  50.1M|      const int16_t *const src2 = src0 + 2 * src_stride;
  150|  50.1M|      const int16_t *const src3 = src0 + 3 * src_stride;
  151|       |
  152|       |      // Load the source data for the three rows, adding the three registers of
  153|       |      // convolved products to one as we go (conv0..conv3) to avoid the
  154|       |      // register pressure getting too high.
  155|  50.1M|      const __m128i conv0 = convolve_16_8(src0, coeff0716);
  156|  50.1M|      const __m128i conv1 = convolve_16_8(src1, coeff0716);
  157|  50.1M|      const __m128i conv2 = convolve_16_8(src2, coeff0716);
  158|  50.1M|      const __m128i conv3 = convolve_16_8(src3, coeff0716);
  159|       |
  160|       |      // Now reduce horizontally to get one lane for each result
  161|  50.1M|      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
  162|  50.1M|      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
  163|  50.1M|      __m128i conv = _mm_hadd_epi32(conv01, conv23);
  164|       |
  165|  50.1M|      conv = _mm_add_epi32(conv, res_add_const);
  166|       |      // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
  167|  50.1M|      __m128i shifted =
  168|  50.1M|          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
  169|       |
  170|  50.1M|      uint8_t *dst_x = dst + y * dst_stride + x;
  171|  50.1M|      __m128i result;
  172|  50.1M|      __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
  173|       |
  174|  50.1M|      if (conv_params->is_compound) {
  ------------------
  |  Branch (174:11): [True: 16.6M, False: 33.4M]
  ------------------
  175|  16.6M|        CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
  176|  16.6M|        if (conv_params->do_average) {
  ------------------
  |  Branch (176:13): [True: 4.37M, False: 12.3M]
  ------------------
  177|  4.37M|          const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
  178|  4.37M|          if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (178:15): [True: 931k, False: 3.44M]
  ------------------
  179|   931k|            const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
  180|   931k|            const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
  181|   931k|            const __m128i shifted_32 =
  182|   931k|                _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
  ------------------
  |  |   76|   931k|#define DIST_PRECISION_BITS 4
  ------------------
  183|   931k|            shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
  184|  3.44M|          } else {
  185|  3.44M|            shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
  186|  3.44M|          }
  187|  4.37M|          const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
  188|  4.37M|          result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
  189|  4.37M|          const __m128i result_8 = _mm_packus_epi16(result, result);
  190|  4.37M|          *(int *)dst_x = _mm_cvtsi128_si32(result_8);
  191|  12.3M|        } else {
  192|  12.3M|          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
  193|  12.3M|        }
  194|  33.4M|      } else {
  195|  33.4M|        const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
  196|  33.4M|        result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
  197|  33.4M|        const __m128i result_8 = _mm_packus_epi16(result, result);
  198|  33.4M|        *(int *)dst_x = _mm_cvtsi128_si32(result_8);
  199|  33.4M|      }
  200|  50.1M|    }
  201|  14.6M|    for (; x < w; ++x) {
  ------------------
  |  Branch (201:12): [True: 1.18M, False: 13.4M]
  ------------------
  202|  1.18M|      const int16_t *src_x = src_y + x * src_stride;
  203|  1.18M|      int32_t sum = 1 << offset_bits;
  204|  10.6M|      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
  ------------------
  |  Branch (204:23): [True: 9.47M, False: 1.18M]
  ------------------
  205|  1.18M|      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
  ------------------
  |  |   41|  1.18M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  206|       |
  207|  1.18M|      if (conv_params->is_compound) {
  ------------------
  |  Branch (207:11): [True: 0, False: 1.18M]
  ------------------
  208|      0|        if (conv_params->do_average) {
  ------------------
  |  Branch (208:13): [True: 0, False: 0]
  ------------------
  209|      0|          int32_t tmp = dst16[y * dst16_stride + x];
  210|      0|          if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (210:15): [True: 0, False: 0]
  ------------------
  211|      0|            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
  212|      0|            tmp = tmp >> DIST_PRECISION_BITS;
  ------------------
  |  |   76|      0|#define DIST_PRECISION_BITS 4
  ------------------
  213|      0|          } else {
  214|      0|            tmp += res;
  215|      0|            tmp = tmp >> 1;
  216|      0|          }
  217|       |          /* Subtract round offset and convolve round */
  218|      0|          tmp = tmp - sub32;
  219|      0|          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  220|      0|        } else {
  221|      0|          dst16[y * dst16_stride + x] = res;
  222|      0|        }
  223|  1.18M|      } else {
  224|       |        /* Subtract round offset and convolve round */
  225|  1.18M|        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
  226|  1.18M|                             (1 << (offset_bits - conv_params->round_1 - 1)));
  227|  1.18M|        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
  ------------------
  |  |   41|  1.18M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  228|  1.18M|      }
  229|  1.18M|    }
  230|  13.4M|  }
  231|  1.59M|}
av1_convolve_scale_sse4.c:convolve_16_8:
  100|   575M|static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
  101|   575M|  __m128i data = _mm_loadu_si128((__m128i *)src);
  102|   575M|  return _mm_madd_epi16(data, coeff);
  103|   575M|}
av1_convolve_scale_sse4.c:highbd_hfilter8:
  265|   724k|                            int bd) {
  266|   724k|  const int ntaps = 8;
  267|       |
  268|   724k|  src -= ntaps / 2 - 1;
  269|       |
  270|   724k|  int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
  ------------------
  |  |   21|   724k|#define FILTER_BITS 7
  ------------------
  271|   724k|  const __m128i round_add = _mm_set1_epi32(round_add32);
  272|   724k|  const __m128i round_shift = _mm_cvtsi32_si128(round);
  273|       |
  274|   724k|  int x_qn = subpel_x_qn;
  275|  10.8M|  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
  ------------------
  |  Branch (275:19): [True: 10.0M, False: 724k]
  ------------------
  276|  10.0M|    const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
  ------------------
  |  |   28|  10.0M|#define SCALE_SUBPEL_BITS 10
  ------------------
  277|  10.0M|    const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   30|  10.0M|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|  10.0M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  10.0M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                  const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  10.0M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  10.0M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  10.0M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  278|  10.0M|    assert(filter_idx < SUBPEL_SHIFTS);
  279|  10.0M|    const int16_t *filter =
  280|  10.0M|        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
  281|       |
  282|       |    // Load the filter coefficients
  283|  10.0M|    const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
  284|       |
  285|  10.0M|    int y;
  286|  72.8M|    for (y = 0; y <= h - 4; y += 4) {
  ------------------
  |  Branch (286:17): [True: 62.7M, False: 10.0M]
  ------------------
  287|  62.7M|      const uint16_t *const src0 = src_col + y * src_stride;
  288|  62.7M|      const uint16_t *const src1 = src0 + 1 * src_stride;
  289|  62.7M|      const uint16_t *const src2 = src0 + 2 * src_stride;
  290|  62.7M|      const uint16_t *const src3 = src0 + 3 * src_stride;
  291|       |
  292|       |      // Load up source data. This is 16-bit input data, so each load gets the 8
  293|       |      // pixels we need.
  294|  62.7M|      const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
  295|  62.7M|      const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
  296|  62.7M|      const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
  297|  62.7M|      const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
  298|       |
  299|       |      // Multiply by coefficients
  300|  62.7M|      const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
  301|  62.7M|      const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
  302|  62.7M|      const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
  303|  62.7M|      const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
  304|       |
  305|       |      // Reduce horizontally and add
  306|  62.7M|      const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
  307|  62.7M|      const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
  308|  62.7M|      const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
  309|       |
  310|       |      // Divide down by (1 << round), rounding to nearest.
  311|  62.7M|      __m128i shifted =
  312|  62.7M|          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
  313|       |
  314|  62.7M|      shifted = _mm_packus_epi32(shifted, shifted);
  315|       |      // Write transposed to the output
  316|  62.7M|      _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
  317|  62.7M|    }
  318|  24.4M|    for (; y < h; ++y) {
  ------------------
  |  Branch (318:12): [True: 14.3M, False: 10.0M]
  ------------------
  319|  14.3M|      const uint16_t *const src_row = src_col + y * src_stride;
  320|       |
  321|  14.3M|      int32_t sum = (1 << (bd + FILTER_BITS - 1));
  ------------------
  |  |   21|  14.3M|#define FILTER_BITS 7
  ------------------
  322|   129M|      for (int k = 0; k < ntaps; ++k) {
  ------------------
  |  Branch (322:23): [True: 114M, False: 14.3M]
  ------------------
  323|   114M|        sum += filter[k] * src_row[k];
  324|   114M|      }
  325|       |
  326|  14.3M|      dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
  ------------------
  |  |   41|  14.3M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  327|  14.3M|    }
  328|  10.0M|  }
  329|   724k|}
av1_convolve_scale_sse4.c:highbd_vfilter8:
  337|   724k|                            const ConvolveParams *conv_params, int bd) {
  338|   724k|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|   724k|#define FILTER_BITS 7
  ------------------
  339|   724k|  const int ntaps = 8;
  340|       |
  341|   724k|  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
  342|       |
  343|   724k|  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
  344|   724k|                         (1 << (offset_bits - conv_params->round_1 - 1)));
  345|   724k|  const __m128i sub = _mm_set1_epi32(sub32);
  346|       |
  347|   724k|  CONV_BUF_TYPE *dst16 = conv_params->dst;
  348|   724k|  const int dst16_stride = conv_params->dst_stride;
  349|   724k|  const __m128i clip_pixel_ =
  350|   724k|      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (350:22): [True: 465k, False: 258k]
  |  Branch (350:41): [True: 258k, False: 0]
  ------------------
  351|   724k|  const int bits =
  352|   724k|      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   724k|#define FILTER_BITS 7
  ------------------
  353|   724k|  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
  354|   724k|  const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1));
  355|   724k|  const __m128i round_shift_add =
  356|   724k|      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
  357|   724k|  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
  358|   724k|  const int round_bits =
  359|   724k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   724k|#define FILTER_BITS 7
  ------------------
  360|   724k|  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  361|   724k|  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
  362|       |
  363|   724k|  const int w0 = conv_params->fwd_offset;
  364|   724k|  const int w1 = conv_params->bck_offset;
  365|   724k|  const __m128i wt0 = _mm_set1_epi32(w0);
  366|   724k|  const __m128i wt1 = _mm_set1_epi32(w1);
  367|       |
  368|   724k|  int y_qn = subpel_y_qn;
  369|  11.0M|  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
  ------------------
  |  Branch (369:19): [True: 10.3M, False: 724k]
  ------------------
  370|  10.3M|    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
  ------------------
  |  |   28|  10.3M|#define SCALE_SUBPEL_BITS 10
  ------------------
  371|  10.3M|    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   30|  10.3M|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|  10.3M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  10.3M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                  const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  10.3M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  10.3M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  10.3M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  372|  10.3M|    assert(filter_idx < SUBPEL_SHIFTS);
  373|  10.3M|    const int16_t *filter =
  374|  10.3M|        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
  375|       |
  376|  10.3M|    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
  377|  10.3M|    int x;
  378|   103M|    for (x = 0; x <= w - 4; x += 4) {
  ------------------
  |  Branch (378:17): [True: 93.5M, False: 10.3M]
  ------------------
  379|  93.5M|      const int16_t *const src0 = src_y + x * src_stride;
  380|  93.5M|      const int16_t *const src1 = src0 + 1 * src_stride;
  381|  93.5M|      const int16_t *const src2 = src0 + 2 * src_stride;
  382|  93.5M|      const int16_t *const src3 = src0 + 3 * src_stride;
  383|       |
  384|       |      // Load the source data for the three rows, adding the three registers of
  385|       |      // convolved products to one as we go (conv0..conv3) to avoid the
  386|       |      // register pressure getting too high.
  387|  93.5M|      const __m128i conv0 = convolve_16_8(src0, coeff0716);
  388|  93.5M|      const __m128i conv1 = convolve_16_8(src1, coeff0716);
  389|  93.5M|      const __m128i conv2 = convolve_16_8(src2, coeff0716);
  390|  93.5M|      const __m128i conv3 = convolve_16_8(src3, coeff0716);
  391|       |
  392|       |      // Now reduce horizontally to get one lane for each result
  393|  93.5M|      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
  394|  93.5M|      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
  395|  93.5M|      __m128i conv = _mm_hadd_epi32(conv01, conv23);
  396|  93.5M|      conv = _mm_add_epi32(conv, res_add_const);
  397|       |
  398|       |      // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
  399|  93.5M|      __m128i shifted =
  400|  93.5M|          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
  401|       |
  402|  93.5M|      uint16_t *dst_x = dst + y * dst_stride + x;
  403|       |
  404|  93.5M|      __m128i result;
  405|  93.5M|      if (conv_params->is_compound) {
  ------------------
  |  Branch (405:11): [True: 11.9M, False: 81.6M]
  ------------------
  406|  11.9M|        CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
  407|  11.9M|        if (conv_params->do_average) {
  ------------------
  |  Branch (407:13): [True: 2.27M, False: 9.64M]
  ------------------
  408|  2.27M|          __m128i p_32 =
  409|  2.27M|              _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
  410|       |
  411|  2.27M|          if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (411:15): [True: 561k, False: 1.71M]
  ------------------
  412|   561k|            shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
  413|   561k|                                    _mm_mullo_epi32(shifted, wt1));
  414|   561k|            shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
  ------------------
  |  |   76|   561k|#define DIST_PRECISION_BITS 4
  ------------------
  415|  1.71M|          } else {
  416|  1.71M|            shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
  417|  1.71M|          }
  418|  2.27M|          result = _mm_sub_epi32(shifted, sub);
  419|  2.27M|          result = _mm_sra_epi32(_mm_add_epi32(result, round_bits_const),
  420|  2.27M|                                 round_bits_shift);
  421|       |
  422|  2.27M|          result = _mm_packus_epi32(result, result);
  423|  2.27M|          result = _mm_min_epi16(result, clip_pixel_);
  424|  2.27M|          _mm_storel_epi64((__m128i *)dst_x, result);
  425|  9.64M|        } else {
  426|  9.64M|          __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
  427|  9.64M|          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
  428|  9.64M|        }
  429|  81.6M|      } else {
  430|  81.6M|        result = _mm_sub_epi32(shifted, sub);
  431|  81.6M|        result = _mm_sra_epi16(_mm_add_epi32(result, bits_const), bits_shift);
  432|  81.6M|        result = _mm_packus_epi32(result, result);
  433|  81.6M|        result = _mm_min_epi16(result, clip_pixel_);
  434|  81.6M|        _mm_storel_epi64((__m128i *)dst_x, result);
  435|  81.6M|      }
  436|  93.5M|    }
  437|       |
  438|  10.6M|    for (; x < w; ++x) {
  ------------------
  |  Branch (438:12): [True: 364k, False: 10.3M]
  ------------------
  439|   364k|      const int16_t *src_x = src_y + x * src_stride;
  440|   364k|      int32_t sum = 1 << offset_bits;
  441|  3.27M|      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
  ------------------
  |  Branch (441:23): [True: 2.91M, False: 364k]
  ------------------
  442|   364k|      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
  ------------------
  |  |   41|   364k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  443|   364k|      if (conv_params->is_compound) {
  ------------------
  |  Branch (443:11): [True: 0, False: 364k]
  ------------------
  444|      0|        if (conv_params->do_average) {
  ------------------
  |  Branch (444:13): [True: 0, False: 0]
  ------------------
  445|      0|          int32_t tmp = dst16[y * dst16_stride + x];
  446|      0|          if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (446:15): [True: 0, False: 0]
  ------------------
  447|      0|            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
  448|      0|            tmp = tmp >> DIST_PRECISION_BITS;
  ------------------
  |  |   76|      0|#define DIST_PRECISION_BITS 4
  ------------------
  449|      0|          } else {
  450|      0|            tmp += res;
  451|      0|            tmp = tmp >> 1;
  452|      0|          }
  453|       |          /* Subtract round offset and convolve round */
  454|      0|          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
  455|      0|                       (1 << (offset_bits - conv_params->round_1 - 1)));
  456|      0|          dst[y * dst_stride + x] =
  457|      0|              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
  ------------------
  |  |   41|      0|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  458|      0|        } else {
  459|      0|          dst16[y * dst16_stride + x] = res;
  460|      0|        }
  461|   364k|      } else {
  462|       |        /* Subtract round offset and convolve round */
  463|   364k|        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
  464|   364k|                             (1 << (offset_bits - conv_params->round_1 - 1)));
  465|   364k|        dst[y * dst_stride + x] =
  466|   364k|            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
  ------------------
  |  |   41|   364k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  467|   364k|      }
  468|   364k|    }
  469|  10.3M|  }
  470|   724k|}

av1_lowbd_inv_txfm2d_add_avx2:
 2211|  6.02M|                                   int eob) {
 2212|  6.02M|  switch (tx_size) {
 2213|   588k|    case TX_4X4:
  ------------------
  |  Branch (2213:5): [True: 588k, False: 5.44M]
  ------------------
 2214|   987k|    case TX_4X8:
  ------------------
  |  Branch (2214:5): [True: 399k, False: 5.62M]
  ------------------
 2215|  1.53M|    case TX_8X4:
  ------------------
  |  Branch (2215:5): [True: 543k, False: 5.48M]
  ------------------
 2216|  1.77M|    case TX_8X16:
  ------------------
  |  Branch (2216:5): [True: 247k, False: 5.78M]
  ------------------
 2217|  2.22M|    case TX_16X8:
  ------------------
  |  Branch (2217:5): [True: 444k, False: 5.58M]
  ------------------
 2218|  2.47M|    case TX_4X16:
  ------------------
  |  Branch (2218:5): [True: 256k, False: 5.77M]
  ------------------
 2219|  2.99M|    case TX_16X4:
  ------------------
  |  Branch (2219:5): [True: 515k, False: 5.51M]
  ------------------
 2220|  3.08M|    case TX_8X32:
  ------------------
  |  Branch (2220:5): [True: 90.6k, False: 5.93M]
  ------------------
 2221|  3.48M|    case TX_32X8:
  ------------------
  |  Branch (2221:5): [True: 403k, False: 5.62M]
  ------------------
 2222|  3.48M|      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
 2223|  3.48M|                                     eob);
 2224|  3.48M|      break;
 2225|   965k|    case TX_8X8:
  ------------------
  |  Branch (2225:5): [True: 965k, False: 5.06M]
  ------------------
 2226|   965k|      lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size,
 2227|   965k|                                    eob);
 2228|   965k|      break;
 2229|   742k|    case TX_16X16:
  ------------------
  |  Branch (2229:5): [True: 742k, False: 5.28M]
  ------------------
 2230|  1.08M|    case TX_32X32:
  ------------------
  |  Branch (2230:5): [True: 340k, False: 5.68M]
  ------------------
 2231|  1.15M|    case TX_64X64:
  ------------------
  |  Branch (2231:5): [True: 76.0k, False: 5.95M]
  ------------------
 2232|  1.23M|    case TX_16X32:
  ------------------
  |  Branch (2232:5): [True: 73.4k, False: 5.95M]
  ------------------
 2233|  1.37M|    case TX_32X16:
  ------------------
  |  Branch (2233:5): [True: 142k, False: 5.88M]
  ------------------
 2234|  1.38M|    case TX_32X64:
  ------------------
  |  Branch (2234:5): [True: 10.2k, False: 6.01M]
  ------------------
 2235|  1.42M|    case TX_64X32:
  ------------------
  |  Branch (2235:5): [True: 36.7k, False: 5.99M]
  ------------------
 2236|  1.43M|    case TX_16X64:
  ------------------
  |  Branch (2236:5): [True: 15.7k, False: 6.01M]
  ------------------
 2237|  1.57M|    case TX_64X16:
  ------------------
  |  Branch (2237:5): [True: 140k, False: 5.88M]
  ------------------
 2238|  1.57M|    default:
  ------------------
  |  Branch (2238:5): [True: 0, False: 6.02M]
  ------------------
 2239|  1.57M|      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
 2240|  1.57M|                                         tx_size, eob);
 2241|  1.57M|      break;
 2242|  6.02M|  }
 2243|  6.02M|}
av1_inv_txfm_add_avx2:
 2246|  6.94M|                           const TxfmParam *txfm_param) {
 2247|  6.94M|  const TX_TYPE tx_type = txfm_param->tx_type;
 2248|  6.94M|  if (!txfm_param->lossless) {
  ------------------
  |  Branch (2248:7): [True: 6.02M, False: 913k]
  ------------------
 2249|  6.02M|    av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
 2250|  6.02M|                                  txfm_param->tx_size, txfm_param->eob);
 2251|  6.02M|  } else {
 2252|   913k|    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
 2253|   913k|  }
 2254|  6.94M|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_8x8_avx2:
 2145|   965k|                                          TX_SIZE tx_size, int eob) {
 2146|   965k|  switch (tx_type) {
 2147|  77.9k|    case IDTX:
  ------------------
  |  Branch (2147:5): [True: 77.9k, False: 887k]
  ------------------
 2148|  77.9k|      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
 2149|       |
 2150|  77.9k|      break;
 2151|  20.5k|    case V_DCT:
  ------------------
  |  Branch (2151:5): [True: 20.5k, False: 944k]
  ------------------
 2152|  26.6k|    case V_ADST:
  ------------------
  |  Branch (2152:5): [True: 6.10k, False: 959k]
  ------------------
 2153|  31.4k|    case V_FLIPADST:
  ------------------
  |  Branch (2153:5): [True: 4.82k, False: 960k]
  ------------------
 2154|  31.4k|      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
 2155|  31.4k|                                                tx_size, eob);
 2156|  31.4k|      break;
 2157|  58.3k|    case H_DCT:
  ------------------
  |  Branch (2157:5): [True: 58.3k, False: 907k]
  ------------------
 2158|  63.7k|    case H_ADST:
  ------------------
  |  Branch (2158:5): [True: 5.42k, False: 960k]
  ------------------
 2159|  67.6k|    case H_FLIPADST:
  ------------------
  |  Branch (2159:5): [True: 3.91k, False: 961k]
  ------------------
 2160|  67.6k|      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
 2161|  67.6k|                                                tx_size, eob);
 2162|  67.6k|      break;
 2163|   788k|    default:
  ------------------
  |  Branch (2163:5): [True: 788k, False: 177k]
  ------------------
 2164|   788k|      lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type,
 2165|   788k|                                            tx_size, eob);
 2166|   965k|  }
 2167|   965k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_8x8_no_identity_avx2:
 2112|   788k|    TX_SIZE tx_size, int eob) {
 2113|   788k|  __m128i buf1[8];
 2114|   788k|  const int input_stride = 8;
 2115|   788k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2116|   788k|  assert(hitx_1d_tab[tx_type] < 2);
 2117|   788k|  assert(vitx_1d_tab[tx_type] < 2);
 2118|   788k|  const transform_1d_ssse3 row_txfm =
 2119|   788k|      lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1];
 2120|   788k|  const transform_1d_ssse3 col_txfm =
 2121|   788k|      lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1];
 2122|       |
 2123|   788k|  assert(col_txfm != NULL);
 2124|   788k|  assert(row_txfm != NULL);
 2125|   788k|  int ud_flip, lr_flip;
 2126|   788k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2127|       |
 2128|   788k|  __m128i buf0[8];
 2129|   788k|  __m128i *buf0_cur = buf0;
 2130|   788k|  load_buffer_avx2(input, input_stride, buf0_cur);
 2131|   788k|  row_txfm(buf0, buf0);
 2132|       |
 2133|   788k|  assert(shift[0] < 0);
 2134|   788k|  __m128i *_buf1 = buf1;
 2135|   788k|  round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip);
 2136|   788k|  assert(shift[1] < 0);
 2137|   788k|  col_txfm(buf1, buf1);
 2138|   788k|  round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip);
 2139|   788k|}
av1_inv_txfm_avx2.c:load_buffer_avx2:
 1896|   788k|                                    __m128i *out) {
 1897|   788k|  const __m256i a = _mm256_load_si256((const __m256i *)in);
 1898|   788k|  const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1));
 1899|   788k|  const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2));
 1900|   788k|  const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3));
 1901|   788k|  const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4));
 1902|   788k|  const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5));
 1903|   788k|  const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6));
 1904|   788k|  const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7));
 1905|       |
 1906|       |  // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
 1907|   788k|  const __m256i ab_16bit = _mm256_packs_epi32(a, b);
 1908|       |  // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
 1909|   788k|  const __m256i cd_16bit = _mm256_packs_epi32(c, d);
 1910|       |  // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7
 1911|   788k|  const __m256i ef_16bit = _mm256_packs_epi32(e, f);
 1912|       |  // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7
 1913|   788k|  const __m256i gh_16bit = _mm256_packs_epi32(g, h);
 1914|       |
 1915|       |  // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
 1916|   788k|  const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8);
 1917|       |  // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7
 1918|   788k|  const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8);
 1919|       |  // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7
 1920|   788k|  const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8);
 1921|       |  // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7
 1922|   788k|  const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8);
 1923|       |
 1924|   788k|  out[0] = _mm256_castsi256_si128(ab);
 1925|   788k|  out[1] = _mm256_extractf128_si256(ab, 1);
 1926|   788k|  out[2] = _mm256_castsi256_si128(cd);
 1927|   788k|  out[3] = _mm256_extractf128_si256(cd, 1);
 1928|   788k|  out[4] = _mm256_castsi256_si128(ef);
 1929|   788k|  out[5] = _mm256_extractf128_si256(ef, 1);
 1930|   788k|  out[6] = _mm256_castsi256_si128(gh);
 1931|   788k|  out[7] = _mm256_extractf128_si256(gh, 1);
 1932|   788k|}
av1_inv_txfm_avx2.c:round_and_transpose_avx2:
 1936|   788k|                                            int *lr_flip) {
 1937|   788k|  __m256i buf_temp[4];
 1938|   788k|  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
 1939|   788k|  int j = *lr_flip ? 7 : 0;
  ------------------
  |  Branch (1939:11): [True: 50.4k, False: 737k]
  ------------------
 1940|   788k|  const int step = *lr_flip ? -1 : 1;
  ------------------
  |  Branch (1940:20): [True: 50.4k, False: 737k]
  ------------------
 1941|       |
 1942|       |  // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
 1943|   788k|  buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
 1944|   788k|                                        in[j + 4 * step], 1);
 1945|   788k|  j += step;
 1946|       |  // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
 1947|   788k|  buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
 1948|   788k|                                        in[j + 4 * step], 1);
 1949|   788k|  j += step;
 1950|       |  // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
 1951|   788k|  buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
 1952|   788k|                                        in[j + 4 * step], 1);
 1953|   788k|  j += step;
 1954|       |  // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
 1955|   788k|  buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
 1956|   788k|                                        in[j + 4 * step], 1);
 1957|       |
 1958|       |  // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
 1959|   788k|  buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale);
 1960|       |  // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
 1961|   788k|  buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale);
 1962|       |  // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
 1963|   788k|  buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale);
 1964|       |  // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
 1965|   788k|  buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale);
 1966|       |
 1967|       |  // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23
 1968|   788k|  const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]);
 1969|       |  // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27
 1970|   788k|  const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]);
 1971|       |  // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03
 1972|   788k|  const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]);
 1973|       |  // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07
 1974|   788k|  const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]);
 1975|       |
 1976|       |  // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01
 1977|   788k|  const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1);
 1978|       |  // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03
 1979|   788k|  const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1);
 1980|       |  // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05
 1981|   788k|  const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1);
 1982|       |  // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07
 1983|   788k|  const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1);
 1984|       |
 1985|       |  // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01
 1986|   788k|  const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8);
 1987|       |  // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03
 1988|   788k|  const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8);
 1989|       |  // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05
 1990|   788k|  const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8);
 1991|       |  // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07
 1992|   788k|  const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8);
 1993|       |
 1994|       |  // 70 60 50 40 30 20 10 00
 1995|   788k|  out[0] = _mm256_castsi256_si128(reg_00);
 1996|       |  // 71 61 51 41 31 21 11 01
 1997|   788k|  out[1] = _mm256_extracti128_si256(reg_00, 1);
 1998|       |  // 72 62 52 42 32 22 12 02
 1999|   788k|  out[2] = _mm256_castsi256_si128(reg_01);
 2000|       |  // 73 63 53 43 33 23 13 03
 2001|   788k|  out[3] = _mm256_extracti128_si256(reg_01, 1);
 2002|       |  // 74 64 54 44 34 24 14 04
 2003|   788k|  out[4] = _mm256_castsi256_si128(reg_10);
 2004|       |  // 75 65 55 45 35 25 15 05
 2005|   788k|  out[5] = _mm256_extracti128_si256(reg_10, 1);
 2006|       |  // 76 66 56 46 36 26 16 06
 2007|   788k|  out[6] = _mm256_castsi256_si128(reg_11);
 2008|       |  // 77 67 57 47 37 27 17 07
 2009|   788k|  out[7] = _mm256_extracti128_si256(reg_11, 1);
 2010|   788k|}
av1_inv_txfm_avx2.c:round_shift_lowbd_write_buffer_avx2:
 2014|   788k|                                                       int stride, int flipud) {
 2015|   788k|  __m256i in_256[4], v_256[4];
 2016|   788k|  int j = flipud ? 7 : 0;
  ------------------
  |  Branch (2016:11): [True: 52.2k, False: 736k]
  ------------------
 2017|   788k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (2017:20): [True: 52.2k, False: 736k]
  ------------------
 2018|   788k|  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
 2019|   788k|  const __m256i zero = _mm256_setzero_si256();
 2020|       |  // in[0], in[1]
 2021|   788k|  in_256[0] =
 2022|   788k|      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
 2023|   788k|  j += 2 * step;
 2024|       |  // in[2], in[3]
 2025|   788k|  in_256[1] =
 2026|   788k|      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
 2027|   788k|  j += 2 * step;
 2028|       |  // in[4], in[5]
 2029|   788k|  in_256[2] =
 2030|   788k|      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
 2031|   788k|  j += 2 * step;
 2032|       |  // in[6], in[7]
 2033|   788k|  in_256[3] =
 2034|   788k|      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
 2035|       |
 2036|       |  // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17
 2037|   788k|  in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale);
 2038|       |  // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37
 2039|   788k|  in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale);
 2040|       |  // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57
 2041|   788k|  in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale);
 2042|       |  // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77
 2043|   788k|  in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale);
 2044|       |
 2045|   788k|  const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output));
 2046|   788k|  const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride));
 2047|   788k|  const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
 2048|   788k|  const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
 2049|   788k|  const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride));
 2050|   788k|  const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride));
 2051|   788k|  const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride));
 2052|   788k|  const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride));
 2053|       |
 2054|   788k|  v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1);
 2055|   788k|  v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1);
 2056|   788k|  v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1);
 2057|   788k|  v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1);
 2058|       |
 2059|   788k|  const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero);
 2060|   788k|  const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero);
 2061|   788k|  const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero);
 2062|   788k|  const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero);
 2063|       |  // 00 01 10 11
 2064|   788k|  const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0);
 2065|       |  // 20 21 30 31
 2066|   788k|  const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1);
 2067|       |  // 40 41 50 51
 2068|   788k|  const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2);
 2069|       |  // 60 61 70 71
 2070|   788k|  const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3);
 2071|       |
 2072|       |  // 00 01 20 21 10 11 30 31
 2073|   788k|  const __m256i res_0123 = _mm256_packus_epi16(x0, x1);
 2074|       |  // 40 41 60 61 50 51 70 71
 2075|   788k|  const __m256i res_4567 = _mm256_packus_epi16(x2, x3);
 2076|       |
 2077|       |  // 00 01 20 21
 2078|   788k|  const __m128i res_02 = _mm256_castsi256_si128(res_0123);
 2079|       |  // 10 11 30 31
 2080|   788k|  const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1);
 2081|       |  // 40 41 60 61
 2082|   788k|  const __m128i res_46 = _mm256_castsi256_si128(res_4567);
 2083|       |  // 50 51 70 71
 2084|   788k|  const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1);
 2085|       |
 2086|       |  // 00 01
 2087|   788k|  _mm_storel_epi64((__m128i *)(output), res_02);
 2088|       |  // 10 11
 2089|   788k|  _mm_storel_epi64((__m128i *)(output + stride), res_13);
 2090|       |  // 20 21
 2091|   788k|  _mm_storel_epi64((__m128i *)(output + 2 * stride),
 2092|   788k|                   _mm_unpackhi_epi64(res_02, res_02));
 2093|       |  // 30 31
 2094|   788k|  _mm_storel_epi64((__m128i *)(output + 3 * stride),
 2095|   788k|                   _mm_unpackhi_epi64(res_13, res_13));
 2096|       |  // 40 41
 2097|   788k|  _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46);
 2098|       |  // 50 51
 2099|   788k|  _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57);
 2100|       |  // 60 61
 2101|   788k|  _mm_storel_epi64((__m128i *)(output + 6 * stride),
 2102|   788k|                   _mm_unpackhi_epi64(res_46, res_46));
 2103|       |  // 70 71
 2104|   788k|  _mm_storel_epi64((__m128i *)(output + 7 * stride),
 2105|   788k|                   _mm_unpackhi_epi64(res_57, res_57));
 2106|   788k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_universe_avx2:
 2172|  1.57M|    TX_SIZE tx_size, int eob) {
 2173|  1.57M|  (void)eob;
 2174|  1.57M|  switch (tx_type) {
 2175|  1.13M|    case DCT_DCT:
  ------------------
  |  Branch (2175:5): [True: 1.13M, False: 443k]
  ------------------
 2176|  1.21M|    case ADST_DCT:   // ADST in vertical, DCT in horizontal
  ------------------
  |  Branch (2176:5): [True: 85.2k, False: 1.49M]
  ------------------
 2177|  1.39M|    case DCT_ADST:   // DCT  in vertical, ADST in horizontal
  ------------------
  |  Branch (2177:5): [True: 176k, False: 1.40M]
  ------------------
 2178|  1.48M|    case ADST_ADST:  // ADST in both directions
  ------------------
  |  Branch (2178:5): [True: 88.4k, False: 1.48M]
  ------------------
 2179|  1.49M|    case FLIPADST_DCT:
  ------------------
  |  Branch (2179:5): [True: 9.16k, False: 1.56M]
  ------------------
 2180|  1.49M|    case DCT_FLIPADST:
  ------------------
  |  Branch (2180:5): [True: 5.70k, False: 1.57M]
  ------------------
 2181|  1.50M|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (2181:5): [True: 5.29k, False: 1.57M]
  ------------------
 2182|  1.50M|    case ADST_FLIPADST:
  ------------------
  |  Branch (2182:5): [True: 5.99k, False: 1.57M]
  ------------------
 2183|  1.51M|    case FLIPADST_ADST:
  ------------------
  |  Branch (2183:5): [True: 8.23k, False: 1.56M]
  ------------------
 2184|  1.51M|      lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
 2185|  1.51M|                                            tx_size, eob);
 2186|  1.51M|      break;
 2187|  35.9k|    case IDTX:
  ------------------
  |  Branch (2187:5): [True: 35.9k, False: 1.54M]
  ------------------
 2188|  35.9k|      lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
 2189|  35.9k|      break;
 2190|  5.30k|    case V_DCT:
  ------------------
  |  Branch (2190:5): [True: 5.30k, False: 1.57M]
  ------------------
 2191|  5.30k|    case V_ADST:
  ------------------
  |  Branch (2191:5): [True: 0, False: 1.57M]
  ------------------
 2192|  5.30k|    case V_FLIPADST:
  ------------------
  |  Branch (2192:5): [True: 0, False: 1.57M]
  ------------------
 2193|  5.30k|      lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
 2194|  5.30k|                                           tx_size, eob);
 2195|  5.30k|      break;
 2196|  18.3k|    case H_DCT:
  ------------------
  |  Branch (2196:5): [True: 18.3k, False: 1.55M]
  ------------------
 2197|  18.3k|    case H_ADST:
  ------------------
  |  Branch (2197:5): [True: 0, False: 1.57M]
  ------------------
 2198|  18.3k|    case H_FLIPADST:
  ------------------
  |  Branch (2198:5): [True: 0, False: 1.57M]
  ------------------
 2199|  18.3k|      lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
 2200|  18.3k|                                           tx_size, eob);
 2201|  18.3k|      break;
 2202|      0|    default:
  ------------------
  |  Branch (2202:5): [True: 0, False: 1.57M]
  ------------------
 2203|      0|      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
 2204|      0|                                     eob);
 2205|      0|      break;
 2206|  1.57M|  }
 2207|  1.57M|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_no_identity_avx2:
 1634|  1.51M|    TX_SIZE tx_size, int eob) {
 1635|  1.51M|  __m256i buf1[64 * 16];
 1636|  1.51M|  int eobx, eoby;
 1637|  1.51M|  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
 1638|  1.51M|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 1639|  1.51M|  const int txw_idx = get_txw_idx(tx_size);
 1640|  1.51M|  const int txh_idx = get_txh_idx(tx_size);
 1641|  1.51M|  const int txfm_size_col = tx_size_wide[tx_size];
 1642|  1.51M|  const int txfm_size_row = tx_size_high[tx_size];
 1643|  1.51M|  const int buf_size_w_div16 = txfm_size_col >> 4;
 1644|  1.51M|  const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4;
 1645|  1.51M|  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
 1646|  1.51M|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  1.51M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 101k, False: 1.41M]
  |  |  ------------------
  ------------------
 1647|  1.51M|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 1648|       |
 1649|  1.51M|  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 1650|  1.51M|  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 1651|  1.51M|  const transform_1d_avx2 row_txfm =
 1652|  1.51M|      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 1653|  1.51M|  const transform_1d_avx2 col_txfm =
 1654|  1.51M|      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 1655|       |
 1656|  1.51M|  assert(col_txfm != NULL);
 1657|  1.51M|  assert(row_txfm != NULL);
 1658|  1.51M|  int ud_flip, lr_flip;
 1659|  1.51M|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 1660|  1.51M|  const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
 1661|  3.08M|  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
  ------------------
  |  Branch (1661:19): [True: 1.56M, False: 1.51M]
  ------------------
 1662|  1.56M|    __m256i buf0[64];
 1663|  1.56M|    load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0,
 1664|  1.56M|                                        buf_size_nonzero_w);
 1665|  1.56M|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (1665:9): [True: 175k, False: 1.39M]
  |  Branch (1665:27): [True: 85.1k, False: 1.30M]
  ------------------
 1666|   260k|      round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
 1667|   260k|    }
 1668|  1.56M|    row_txfm(buf0, buf0);
 1669|  47.7M|    for (int j = 0; j < txfm_size_col; ++j) {
  ------------------
  |  Branch (1669:21): [True: 46.1M, False: 1.56M]
  ------------------
 1670|  46.1M|      buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
 1671|  46.1M|    }
 1672|       |
 1673|  1.56M|    __m256i *buf1_cur = buf1 + (i << 4);
 1674|  1.56M|    if (lr_flip) {
  ------------------
  |  Branch (1674:9): [True: 16.9k, False: 1.55M]
  ------------------
 1675|  33.9k|      for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1675:23): [True: 16.9k, False: 16.9k]
  ------------------
 1676|  16.9k|        __m256i temp[16];
 1677|  16.9k|        flip_buf_avx2(buf0 + 16 * j, temp, 16);
 1678|  16.9k|        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
 1679|  16.9k|        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
 1680|  16.9k|      }
 1681|  1.55M|    } else {
 1682|  4.41M|      for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1682:23): [True: 2.86M, False: 1.55M]
  ------------------
 1683|  2.86M|        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
 1684|  2.86M|      }
 1685|  1.55M|    }
 1686|  1.56M|  }
 1687|  1.51M|  const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
 1688|  4.27M|  for (int i = 0; i < buf_size_w_div16; i++) {
  ------------------
  |  Branch (1688:19): [True: 2.75M, False: 1.51M]
  ------------------
 1689|  2.75M|    __m256i *buf1_cur = buf1 + i * txfm_size_row;
 1690|  2.75M|    col_txfm(buf1_cur, buf1_cur);
 1691|  77.4M|    for (int j = 0; j < txfm_size_row; ++j) {
  ------------------
  |  Branch (1691:21): [True: 74.7M, False: 2.75M]
  ------------------
 1692|  74.7M|      buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
 1693|  74.7M|    }
 1694|  2.75M|  }
 1695|  4.27M|  for (int i = 0; i < buf_size_w_div16; i++) {
  ------------------
  |  Branch (1695:19): [True: 2.76M, False: 1.51M]
  ------------------
 1696|  2.76M|    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
 1697|  2.76M|                                 stride, ud_flip, txfm_size_row);
 1698|  2.76M|  }
 1699|  1.51M|}
av1_inv_txfm_avx2.c:idct16_low1_avx2:
  192|   611k|static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
  193|   611k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   611k|#define INV_COS_BIT 12
  ------------------
  194|       |
  195|       |  // stage 1
  196|   611k|  __m256i x1[2];
  197|   611k|  x1[0] = input[0];
  198|       |
  199|       |  // stage 2
  200|       |  // stage 3
  201|       |  // stage 4
  202|   611k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
  ------------------
  |  |   30|   611k|  do {                                             \
  |  |   31|   611k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   611k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   611k|    const __m256i _in = in;                        \
  |  |   34|   611k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   611k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   611k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  203|       |
  204|       |  // stage 5
  205|       |  // stage 6
  206|   611k|  output[0] = x1[0];
  207|   611k|  output[1] = x1[1];
  208|   611k|  output[2] = x1[1];
  209|   611k|  output[3] = x1[0];
  210|   611k|  output[4] = x1[0];
  211|   611k|  output[5] = x1[1];
  212|   611k|  output[6] = x1[1];
  213|   611k|  output[7] = x1[0];
  214|   611k|  output[8] = x1[0];
  215|   611k|  output[9] = x1[1];
  216|   611k|  output[10] = x1[1];
  217|   611k|  output[11] = x1[0];
  218|   611k|  output[12] = x1[0];
  219|   611k|  output[13] = x1[1];
  220|   611k|  output[14] = x1[1];
  221|   611k|  output[15] = x1[0];
  222|   611k|}
av1_inv_txfm_avx2.c:idct16_low8_avx2:
  144|   833k|static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
  145|   833k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   833k|#define INV_COS_BIT 12
  ------------------
  146|   833k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   833k|#define INV_COS_BIT 12
  ------------------
  147|       |
  148|   833k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  149|   833k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  150|   833k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  151|       |
  152|       |  // stage 1
  153|   833k|  __m256i x1[16];
  154|   833k|  x1[0] = input[0];
  155|   833k|  x1[2] = input[4];
  156|   833k|  x1[4] = input[2];
  157|   833k|  x1[6] = input[6];
  158|   833k|  x1[8] = input[1];
  159|   833k|  x1[10] = input[5];
  160|   833k|  x1[12] = input[3];
  161|   833k|  x1[14] = input[7];
  162|       |
  163|       |  // stage 2
  164|   833k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
  ------------------
  |  |   30|   833k|  do {                                             \
  |  |   31|   833k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   833k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   833k|    const __m256i _in = in;                        \
  |  |   34|   833k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   833k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   833k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  165|   833k|  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
  ------------------
  |  |   30|   833k|  do {                                             \
  |  |   31|   833k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   833k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   833k|    const __m256i _in = in;                        \
  |  |   34|   833k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   833k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   833k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  166|   833k|  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
  ------------------
  |  |   30|   833k|  do {                                             \
  |  |   31|   833k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   833k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   833k|    const __m256i _in = in;                        \
  |  |   34|   833k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   833k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   833k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  167|   833k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
  ------------------
  |  |   30|   833k|  do {                                             \
  |  |   31|   833k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   833k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   833k|    const __m256i _in = in;                        \
  |  |   34|   833k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   833k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   833k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  168|       |
  169|       |  // stage 3
  170|   833k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
  ------------------
  |  |   30|   833k|  do {                                             \
  |  |   31|   833k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   833k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   833k|    const __m256i _in = in;                        \
  |  |   34|   833k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   833k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   833k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  171|   833k|  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
  ------------------
  |  |   30|   833k|  do {                                             \
  |  |   31|   833k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   833k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   833k|    const __m256i _in = in;                        \
  |  |   34|   833k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   833k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   833k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  172|   833k|  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  173|   833k|  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  174|   833k|  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  175|   833k|  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  176|       |
  177|       |  // stage 4
  178|   833k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
  ------------------
  |  |   30|   833k|  do {                                             \
  |  |   31|   833k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   833k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   833k|    const __m256i _in = in;                        \
  |  |   34|   833k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   833k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   833k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  179|   833k|  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
  ------------------
  |  |   30|   833k|  do {                                             \
  |  |   31|   833k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   833k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   833k|    const __m256i _in = in;                        \
  |  |   34|   833k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   833k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   833k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  180|   833k|  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  181|   833k|  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  182|   833k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
  183|   833k|                  INV_COS_BIT);
  ------------------
  |  |   43|   833k|#define INV_COS_BIT 12
  ------------------
  184|   833k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
  185|   833k|                  INV_COS_BIT);
  ------------------
  |  |   43|   833k|#define INV_COS_BIT 12
  ------------------
  186|       |
  187|   833k|  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   833k|#define INV_COS_BIT 12
  ------------------
  188|   833k|  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   833k|#define INV_COS_BIT 12
  ------------------
  189|   833k|  idct16_stage7_avx2(output, x1);
  190|   833k|}
av1_inv_txfm_avx2.c:idct16_stage5_avx2:
   28|  1.23M|                                      const __m256i _r, int8_t cos_bit) {
   29|  1.23M|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   30|  1.23M|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   31|  1.23M|  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
   32|  1.23M|  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
   33|  1.23M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
   34|       |
   35|  1.23M|  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
   36|  1.23M|  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
   37|  1.23M|  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
   38|  1.23M|  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
   39|  1.23M|}
av1_inv_txfm_avx2.c:idct16_stage6_avx2:
   42|  1.23M|                                      const __m256i _r, int8_t cos_bit) {
   43|  1.23M|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
   44|  1.23M|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   45|  1.23M|  btf_16_adds_subs_avx2(&x[0], &x[7]);
   46|  1.23M|  btf_16_adds_subs_avx2(&x[1], &x[6]);
   47|  1.23M|  btf_16_adds_subs_avx2(&x[2], &x[5]);
   48|  1.23M|  btf_16_adds_subs_avx2(&x[3], &x[4]);
   49|  1.23M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
   50|  1.23M|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
   51|  1.23M|}
av1_inv_txfm_avx2.c:idct16_stage7_avx2:
   53|  1.23M|static inline void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
   54|  1.23M|  btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
   55|  1.23M|  btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
   56|  1.23M|  btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
   57|  1.23M|  btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
   58|  1.23M|  btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
   59|  1.23M|  btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
   60|  1.23M|  btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
   61|  1.23M|  btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
   62|  1.23M|}
av1_inv_txfm_avx2.c:idct16_avx2:
   64|   397k|static void idct16_avx2(const __m256i *input, __m256i *output) {
   65|   397k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
   66|   397k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
   67|       |
   68|   397k|  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
   69|   397k|  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
   70|   397k|  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
   71|   397k|  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
   72|   397k|  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
   73|   397k|  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
   74|   397k|  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
   75|   397k|  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
   76|   397k|  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
   77|   397k|  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
   78|   397k|  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
   79|   397k|  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
   80|   397k|  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
   81|   397k|  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
   82|   397k|  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
   83|   397k|  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
   84|   397k|  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
   85|   397k|  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
   86|   397k|  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
   87|       |
   88|       |  // stage 1
   89|   397k|  __m256i x1[16];
   90|   397k|  x1[0] = input[0];
   91|   397k|  x1[1] = input[8];
   92|   397k|  x1[2] = input[4];
   93|   397k|  x1[3] = input[12];
   94|   397k|  x1[4] = input[2];
   95|   397k|  x1[5] = input[10];
   96|   397k|  x1[6] = input[6];
   97|   397k|  x1[7] = input[14];
   98|   397k|  x1[8] = input[1];
   99|   397k|  x1[9] = input[9];
  100|   397k|  x1[10] = input[5];
  101|   397k|  x1[11] = input[13];
  102|   397k|  x1[12] = input[3];
  103|   397k|  x1[13] = input[11];
  104|   397k|  x1[14] = input[7];
  105|   397k|  x1[15] = input[15];
  106|       |
  107|       |  // stage 2
  108|   397k|  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
  109|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  110|   397k|  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
  111|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  112|   397k|  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
  113|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  114|   397k|  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
  115|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  116|       |
  117|       |  // stage 3
  118|   397k|  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
  119|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  120|   397k|  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
  121|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  122|   397k|  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  123|   397k|  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  124|   397k|  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  125|   397k|  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  126|       |
  127|       |  // stage 4
  128|   397k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
  129|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  130|   397k|  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
  131|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  132|   397k|  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  133|   397k|  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  134|   397k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
  135|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  136|   397k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
  137|   397k|                  INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  138|       |
  139|   397k|  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  140|   397k|  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   397k|#define INV_COS_BIT 12
  ------------------
  141|   397k|  idct16_stage7_avx2(output, x1);
  142|   397k|}
av1_inv_txfm_avx2.c:iadst16_low1_avx2:
  414|   106k|static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
  415|   106k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   106k|#define INV_COS_BIT 12
  ------------------
  416|   106k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   106k|#define INV_COS_BIT 12
  ------------------
  417|       |
  418|   106k|  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  419|   106k|  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  420|   106k|  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  421|   106k|  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  422|       |
  423|       |  // stage 1
  424|   106k|  __m256i x1[16];
  425|   106k|  x1[1] = input[0];
  426|       |
  427|       |  // stage 2
  428|   106k|  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
  ------------------
  |  |   30|   106k|  do {                                             \
  |  |   31|   106k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   106k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   106k|    const __m256i _in = in;                        \
  |  |   34|   106k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   106k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   106k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  429|       |
  430|       |  // stage 3
  431|   106k|  x1[8] = x1[0];
  432|   106k|  x1[9] = x1[1];
  433|       |
  434|       |  // stage 4
  435|   106k|  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r,
  436|   106k|                  INV_COS_BIT);
  ------------------
  |  |   43|   106k|#define INV_COS_BIT 12
  ------------------
  437|       |
  438|       |  // stage 5
  439|   106k|  x1[4] = x1[0];
  440|   106k|  x1[5] = x1[1];
  441|       |
  442|   106k|  x1[12] = x1[8];
  443|   106k|  x1[13] = x1[9];
  444|       |
  445|       |  // stage 6
  446|   106k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r,
  447|   106k|                  INV_COS_BIT);
  ------------------
  |  |   43|   106k|#define INV_COS_BIT 12
  ------------------
  448|   106k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r,
  449|   106k|                  INV_COS_BIT);
  ------------------
  |  |   43|   106k|#define INV_COS_BIT 12
  ------------------
  450|       |
  451|       |  // stage 7
  452|   106k|  x1[2] = x1[0];
  453|   106k|  x1[3] = x1[1];
  454|   106k|  x1[6] = x1[4];
  455|   106k|  x1[7] = x1[5];
  456|   106k|  x1[10] = x1[8];
  457|   106k|  x1[11] = x1[9];
  458|   106k|  x1[14] = x1[12];
  459|   106k|  x1[15] = x1[13];
  460|       |
  461|   106k|  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   106k|#define INV_COS_BIT 12
  ------------------
  462|   106k|  iadst16_stage9_avx2(output, x1);
  463|   106k|}
av1_inv_txfm_avx2.c:iadst16_stage8_avx2:
  283|   492k|                                       const __m256i _r, int8_t cos_bit) {
  284|   492k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  285|   492k|  const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  286|   492k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
  287|   492k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
  288|   492k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
  289|   492k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
  290|   492k|}
av1_inv_txfm_avx2.c:iadst16_stage9_avx2:
  292|   492k|static inline void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
  293|   492k|  const __m256i __zero = _mm256_setzero_si256();
  294|   492k|  output[0] = x1[0];
  295|   492k|  output[1] = _mm256_subs_epi16(__zero, x1[8]);
  296|   492k|  output[2] = x1[12];
  297|   492k|  output[3] = _mm256_subs_epi16(__zero, x1[4]);
  298|   492k|  output[4] = x1[6];
  299|   492k|  output[5] = _mm256_subs_epi16(__zero, x1[14]);
  300|   492k|  output[6] = x1[10];
  301|   492k|  output[7] = _mm256_subs_epi16(__zero, x1[2]);
  302|   492k|  output[8] = x1[3];
  303|   492k|  output[9] = _mm256_subs_epi16(__zero, x1[11]);
  304|   492k|  output[10] = x1[15];
  305|   492k|  output[11] = _mm256_subs_epi16(__zero, x1[7]);
  306|   492k|  output[12] = x1[5];
  307|   492k|  output[13] = _mm256_subs_epi16(__zero, x1[13]);
  308|   492k|  output[14] = x1[9];
  309|   492k|  output[15] = _mm256_subs_epi16(__zero, x1[1]);
  310|   492k|}
av1_inv_txfm_avx2.c:iadst16_low8_avx2:
  380|   302k|static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
  381|   302k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   302k|#define INV_COS_BIT 12
  ------------------
  382|   302k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   302k|#define INV_COS_BIT 12
  ------------------
  383|       |
  384|       |  // stage 1
  385|   302k|  __m256i x1[16];
  386|   302k|  x1[1] = input[0];
  387|   302k|  x1[3] = input[2];
  388|   302k|  x1[5] = input[4];
  389|   302k|  x1[7] = input[6];
  390|   302k|  x1[8] = input[7];
  391|   302k|  x1[10] = input[5];
  392|   302k|  x1[12] = input[3];
  393|   302k|  x1[14] = input[1];
  394|       |
  395|       |  // stage 2
  396|   302k|  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
  ------------------
  |  |   30|   302k|  do {                                             \
  |  |   31|   302k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   302k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   302k|    const __m256i _in = in;                        \
  |  |   34|   302k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   302k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   302k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  397|   302k|  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
  ------------------
  |  |   30|   302k|  do {                                             \
  |  |   31|   302k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   302k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   302k|    const __m256i _in = in;                        \
  |  |   34|   302k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   302k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   302k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  398|   302k|  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
  ------------------
  |  |   30|   302k|  do {                                             \
  |  |   31|   302k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   302k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   302k|    const __m256i _in = in;                        \
  |  |   34|   302k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   302k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   302k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  399|   302k|  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
  ------------------
  |  |   30|   302k|  do {                                             \
  |  |   31|   302k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   302k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   302k|    const __m256i _in = in;                        \
  |  |   34|   302k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   302k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   302k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  400|   302k|  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
  ------------------
  |  |   30|   302k|  do {                                             \
  |  |   31|   302k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   302k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   302k|    const __m256i _in = in;                        \
  |  |   34|   302k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   302k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   302k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  401|   302k|  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
  ------------------
  |  |   30|   302k|  do {                                             \
  |  |   31|   302k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   302k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   302k|    const __m256i _in = in;                        \
  |  |   34|   302k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   302k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   302k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  402|   302k|  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
  ------------------
  |  |   30|   302k|  do {                                             \
  |  |   31|   302k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   302k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   302k|    const __m256i _in = in;                        \
  |  |   34|   302k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   302k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   302k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  403|   302k|  btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
  ------------------
  |  |   30|   302k|  do {                                             \
  |  |   31|   302k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   302k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   302k|    const __m256i _in = in;                        \
  |  |   34|   302k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   302k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   302k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  404|       |
  405|   302k|  iadst16_stage3_avx2(x1);
  406|   302k|  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   302k|#define INV_COS_BIT 12
  ------------------
  407|   302k|  iadst16_stage5_avx2(x1);
  408|   302k|  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   302k|#define INV_COS_BIT 12
  ------------------
  409|   302k|  iadst16_stage7_avx2(x1);
  410|   302k|  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   302k|#define INV_COS_BIT 12
  ------------------
  411|   302k|  iadst16_stage9_avx2(output, x1);
  412|   302k|}
av1_inv_txfm_avx2.c:iadst16_stage3_avx2:
  224|   385k|static inline void iadst16_stage3_avx2(__m256i *x) {
  225|   385k|  btf_16_adds_subs_avx2(&x[0], &x[8]);
  226|   385k|  btf_16_adds_subs_avx2(&x[1], &x[9]);
  227|   385k|  btf_16_adds_subs_avx2(&x[2], &x[10]);
  228|   385k|  btf_16_adds_subs_avx2(&x[3], &x[11]);
  229|   385k|  btf_16_adds_subs_avx2(&x[4], &x[12]);
  230|   385k|  btf_16_adds_subs_avx2(&x[5], &x[13]);
  231|   385k|  btf_16_adds_subs_avx2(&x[6], &x[14]);
  232|   385k|  btf_16_adds_subs_avx2(&x[7], &x[15]);
  233|   385k|}
av1_inv_txfm_avx2.c:iadst16_stage4_avx2:
  236|   385k|                                       const __m256i _r, int8_t cos_bit) {
  237|   385k|  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  238|   385k|  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  239|   385k|  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  240|   385k|  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  241|   385k|  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
  242|   385k|  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
  243|   385k|  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
  244|   385k|  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
  245|   385k|  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
  246|   385k|  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
  247|   385k|}
av1_inv_txfm_avx2.c:iadst16_stage5_avx2:
  249|   385k|static inline void iadst16_stage5_avx2(__m256i *x) {
  250|   385k|  btf_16_adds_subs_avx2(&x[0], &x[4]);
  251|   385k|  btf_16_adds_subs_avx2(&x[1], &x[5]);
  252|   385k|  btf_16_adds_subs_avx2(&x[2], &x[6]);
  253|   385k|  btf_16_adds_subs_avx2(&x[3], &x[7]);
  254|   385k|  btf_16_adds_subs_avx2(&x[8], &x[12]);
  255|   385k|  btf_16_adds_subs_avx2(&x[9], &x[13]);
  256|   385k|  btf_16_adds_subs_avx2(&x[10], &x[14]);
  257|   385k|  btf_16_adds_subs_avx2(&x[11], &x[15]);
  258|   385k|}
av1_inv_txfm_avx2.c:iadst16_stage6_avx2:
  261|   385k|                                       const __m256i _r, int8_t cos_bit) {
  262|   385k|  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  263|   385k|  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  264|   385k|  const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
  265|   385k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
  266|   385k|  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
  267|   385k|  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
  268|   385k|  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
  269|   385k|}
av1_inv_txfm_avx2.c:iadst16_stage7_avx2:
  271|   385k|static inline void iadst16_stage7_avx2(__m256i *x) {
  272|   385k|  btf_16_adds_subs_avx2(&x[0], &x[2]);
  273|   385k|  btf_16_adds_subs_avx2(&x[1], &x[3]);
  274|   385k|  btf_16_adds_subs_avx2(&x[4], &x[6]);
  275|   385k|  btf_16_adds_subs_avx2(&x[5], &x[7]);
  276|   385k|  btf_16_adds_subs_avx2(&x[8], &x[10]);
  277|   385k|  btf_16_adds_subs_avx2(&x[9], &x[11]);
  278|   385k|  btf_16_adds_subs_avx2(&x[12], &x[14]);
  279|   385k|  btf_16_adds_subs_avx2(&x[13], &x[15]);
  280|   385k|}
av1_inv_txfm_avx2.c:iadst16_avx2:
  312|  83.3k|static void iadst16_avx2(const __m256i *input, __m256i *output) {
  313|  83.3k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  314|       |
  315|  83.3k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  316|       |
  317|  83.3k|  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  318|  83.3k|  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  319|  83.3k|  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  320|  83.3k|  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  321|  83.3k|  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  322|  83.3k|  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  323|  83.3k|  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  324|  83.3k|  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  325|  83.3k|  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  326|  83.3k|  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  327|  83.3k|  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  328|  83.3k|  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  329|  83.3k|  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  330|  83.3k|  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  331|  83.3k|  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  332|  83.3k|  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  333|       |
  334|       |  // stage 1
  335|  83.3k|  __m256i x1[16];
  336|  83.3k|  x1[0] = input[15];
  337|  83.3k|  x1[1] = input[0];
  338|  83.3k|  x1[2] = input[13];
  339|  83.3k|  x1[3] = input[2];
  340|  83.3k|  x1[4] = input[11];
  341|  83.3k|  x1[5] = input[4];
  342|  83.3k|  x1[6] = input[9];
  343|  83.3k|  x1[7] = input[6];
  344|  83.3k|  x1[8] = input[7];
  345|  83.3k|  x1[9] = input[8];
  346|  83.3k|  x1[10] = input[5];
  347|  83.3k|  x1[11] = input[10];
  348|  83.3k|  x1[12] = input[3];
  349|  83.3k|  x1[13] = input[12];
  350|  83.3k|  x1[14] = input[1];
  351|  83.3k|  x1[15] = input[14];
  352|       |
  353|       |  // stage 2
  354|  83.3k|  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r,
  355|  83.3k|                  INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  356|  83.3k|  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r,
  357|  83.3k|                  INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  358|  83.3k|  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r,
  359|  83.3k|                  INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  360|  83.3k|  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r,
  361|  83.3k|                  INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  362|  83.3k|  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r,
  363|  83.3k|                  INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  364|  83.3k|  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r,
  365|  83.3k|                  INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  366|  83.3k|  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r,
  367|  83.3k|                  INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  368|  83.3k|  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r,
  369|  83.3k|                  INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  370|       |
  371|  83.3k|  iadst16_stage3_avx2(x1);
  372|  83.3k|  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  373|  83.3k|  iadst16_stage5_avx2(x1);
  374|  83.3k|  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  375|  83.3k|  iadst16_stage7_avx2(x1);
  376|  83.3k|  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|  83.3k|#define INV_COS_BIT 12
  ------------------
  377|  83.3k|  iadst16_stage9_avx2(output, x1);
  378|  83.3k|}
av1_inv_txfm_avx2.c:idct32_low1_avx2:
  582|   467k|static void idct32_low1_avx2(const __m256i *input, __m256i *output) {
  583|   467k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   467k|#define INV_COS_BIT 12
  ------------------
  584|       |
  585|       |  // stage 1
  586|   467k|  __m256i x[2];
  587|   467k|  x[0] = input[0];
  588|       |
  589|       |  // stage 2
  590|       |  // stage 3
  591|       |  // stage 4
  592|       |  // stage 5
  593|   467k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|   467k|  do {                                             \
  |  |   31|   467k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   467k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   467k|    const __m256i _in = in;                        \
  |  |   34|   467k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   467k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   467k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  594|       |
  595|       |  // stage 6
  596|       |  // stage 7
  597|       |  // stage 8
  598|       |  // stage 9
  599|   467k|  output[0] = x[0];
  600|   467k|  output[31] = x[0];
  601|   467k|  output[1] = x[1];
  602|   467k|  output[30] = x[1];
  603|   467k|  output[2] = x[1];
  604|   467k|  output[29] = x[1];
  605|   467k|  output[3] = x[0];
  606|   467k|  output[28] = x[0];
  607|   467k|  output[4] = x[0];
  608|   467k|  output[27] = x[0];
  609|   467k|  output[5] = x[1];
  610|   467k|  output[26] = x[1];
  611|   467k|  output[6] = x[1];
  612|   467k|  output[25] = x[1];
  613|   467k|  output[7] = x[0];
  614|   467k|  output[24] = x[0];
  615|   467k|  output[8] = x[0];
  616|   467k|  output[23] = x[0];
  617|   467k|  output[9] = x[1];
  618|   467k|  output[22] = x[1];
  619|   467k|  output[10] = x[1];
  620|   467k|  output[21] = x[1];
  621|   467k|  output[11] = x[0];
  622|   467k|  output[20] = x[0];
  623|   467k|  output[12] = x[0];
  624|   467k|  output[19] = x[0];
  625|   467k|  output[13] = x[1];
  626|   467k|  output[18] = x[1];
  627|   467k|  output[14] = x[1];
  628|   467k|  output[17] = x[1];
  629|   467k|  output[15] = x[0];
  630|   467k|  output[16] = x[0];
  631|   467k|}
av1_inv_txfm_avx2.c:idct32_low8_avx2:
  633|   625k|static void idct32_low8_avx2(const __m256i *input, __m256i *output) {
  634|   625k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   625k|#define INV_COS_BIT 12
  ------------------
  635|   625k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   625k|#define INV_COS_BIT 12
  ------------------
  636|       |
  637|       |  // stage 1
  638|   625k|  __m256i x[32];
  639|   625k|  x[0] = input[0];
  640|   625k|  x[4] = input[4];
  641|   625k|  x[8] = input[2];
  642|   625k|  x[12] = input[6];
  643|   625k|  x[16] = input[1];
  644|   625k|  x[20] = input[5];
  645|   625k|  x[24] = input[3];
  646|   625k|  x[28] = input[7];
  647|       |
  648|       |  // stage 2
  649|   625k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|   625k|  do {                                             \
  |  |   31|   625k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   625k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   625k|    const __m256i _in = in;                        \
  |  |   34|   625k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   625k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   625k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  650|   625k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   30|   625k|  do {                                             \
  |  |   31|   625k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   625k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   625k|    const __m256i _in = in;                        \
  |  |   34|   625k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   625k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   625k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  651|   625k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   30|   625k|  do {                                             \
  |  |   31|   625k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   625k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   625k|    const __m256i _in = in;                        \
  |  |   34|   625k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   625k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   625k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  652|   625k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|   625k|  do {                                             \
  |  |   31|   625k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   625k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   625k|    const __m256i _in = in;                        \
  |  |   34|   625k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   625k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   625k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  653|       |
  654|       |  // stage 3
  655|   625k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|   625k|  do {                                             \
  |  |   31|   625k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   625k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   625k|    const __m256i _in = in;                        \
  |  |   34|   625k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   625k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   625k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  656|   625k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   30|   625k|  do {                                             \
  |  |   31|   625k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   625k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   625k|    const __m256i _in = in;                        \
  |  |   34|   625k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   625k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   625k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  657|   625k|  x[17] = x[16];
  658|   625k|  x[18] = x[19];
  659|   625k|  x[21] = x[20];
  660|   625k|  x[22] = x[23];
  661|   625k|  x[25] = x[24];
  662|   625k|  x[26] = x[27];
  663|   625k|  x[29] = x[28];
  664|   625k|  x[30] = x[31];
  665|       |
  666|       |  // stage 4
  667|   625k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   30|   625k|  do {                                             \
  |  |   31|   625k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   625k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   625k|    const __m256i _in = in;                        \
  |  |   34|   625k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   625k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   625k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  668|   625k|  x[9] = x[8];
  669|   625k|  x[10] = x[11];
  670|   625k|  x[13] = x[12];
  671|   625k|  x[14] = x[15];
  672|   625k|  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   625k|#define INV_COS_BIT 12
  ------------------
  673|       |
  674|       |  // stage 5
  675|   625k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|   625k|  do {                                             \
  |  |   31|   625k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   625k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   625k|    const __m256i _in = in;                        \
  |  |   34|   625k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   625k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   625k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  676|   625k|  x[5] = x[4];
  677|   625k|  x[6] = x[7];
  678|   625k|  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   625k|#define INV_COS_BIT 12
  ------------------
  679|       |  // stage 6
  680|   625k|  x[3] = x[0];
  681|   625k|  x[2] = x[1];
  682|   625k|  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   625k|#define INV_COS_BIT 12
  ------------------
  683|       |
  684|   625k|  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   625k|#define INV_COS_BIT 12
  ------------------
  685|   625k|  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   625k|#define INV_COS_BIT 12
  ------------------
  686|   625k|  idct32_stage9_avx2(output, x);
  687|   625k|}
av1_inv_txfm_avx2.c:idct32_high16_stage4_avx2:
  477|   941k|                                             const __m256i _r, int8_t cos_bit) {
  478|   941k|  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  479|   941k|  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  480|   941k|  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  481|   941k|  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  482|   941k|  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  483|   941k|  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  484|   941k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  485|   941k|  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  486|   941k|  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  487|   941k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  488|   941k|}
av1_inv_txfm_avx2.c:idct32_high24_stage5_avx2:
  491|   941k|                                             const __m256i _r, int8_t cos_bit) {
  492|   941k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  493|   941k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  494|   941k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  495|   941k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  496|   941k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  497|   941k|  btf_16_adds_subs_avx2(&x[16], &x[19]);
  498|   941k|  btf_16_adds_subs_avx2(&x[17], &x[18]);
  499|   941k|  btf_16_adds_subs_avx2(&x[23], &x[20]);
  500|   941k|  btf_16_adds_subs_avx2(&x[22], &x[21]);
  501|   941k|  btf_16_adds_subs_avx2(&x[24], &x[27]);
  502|   941k|  btf_16_adds_subs_avx2(&x[25], &x[26]);
  503|   941k|  btf_16_adds_subs_avx2(&x[31], &x[28]);
  504|   941k|  btf_16_adds_subs_avx2(&x[30], &x[29]);
  505|   941k|}
av1_inv_txfm_avx2.c:idct32_high28_stage6_avx2:
  508|   941k|                                             const __m256i _r, int8_t cos_bit) {
  509|   941k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  510|   941k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  511|   941k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  512|   941k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  513|   941k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  514|   941k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  515|   941k|  btf_16_adds_subs_avx2(&x[8], &x[11]);
  516|   941k|  btf_16_adds_subs_avx2(&x[9], &x[10]);
  517|   941k|  btf_16_adds_subs_avx2(&x[15], &x[12]);
  518|   941k|  btf_16_adds_subs_avx2(&x[14], &x[13]);
  519|   941k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  520|   941k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  521|   941k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  522|   941k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
  523|   941k|}
av1_inv_txfm_avx2.c:idct32_stage7_avx2:
  526|   941k|                                      const __m256i _r, int8_t cos_bit) {
  527|   941k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  528|   941k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  529|   941k|  btf_16_adds_subs_avx2(&x[0], &x[7]);
  530|   941k|  btf_16_adds_subs_avx2(&x[1], &x[6]);
  531|   941k|  btf_16_adds_subs_avx2(&x[2], &x[5]);
  532|   941k|  btf_16_adds_subs_avx2(&x[3], &x[4]);
  533|   941k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  534|   941k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  535|   941k|  btf_16_adds_subs_avx2(&x[16], &x[23]);
  536|   941k|  btf_16_adds_subs_avx2(&x[17], &x[22]);
  537|   941k|  btf_16_adds_subs_avx2(&x[18], &x[21]);
  538|   941k|  btf_16_adds_subs_avx2(&x[19], &x[20]);
  539|   941k|  btf_16_adds_subs_avx2(&x[31], &x[24]);
  540|   941k|  btf_16_adds_subs_avx2(&x[30], &x[25]);
  541|   941k|  btf_16_adds_subs_avx2(&x[29], &x[26]);
  542|   941k|  btf_16_adds_subs_avx2(&x[28], &x[27]);
  543|   941k|}
av1_inv_txfm_avx2.c:idct32_stage8_avx2:
  546|   941k|                                      const __m256i _r, int8_t cos_bit) {
  547|   941k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  548|   941k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  549|   941k|  btf_16_adds_subs_avx2(&x[0], &x[15]);
  550|   941k|  btf_16_adds_subs_avx2(&x[1], &x[14]);
  551|   941k|  btf_16_adds_subs_avx2(&x[2], &x[13]);
  552|   941k|  btf_16_adds_subs_avx2(&x[3], &x[12]);
  553|   941k|  btf_16_adds_subs_avx2(&x[4], &x[11]);
  554|   941k|  btf_16_adds_subs_avx2(&x[5], &x[10]);
  555|   941k|  btf_16_adds_subs_avx2(&x[6], &x[9]);
  556|   941k|  btf_16_adds_subs_avx2(&x[7], &x[8]);
  557|   941k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
  558|   941k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
  559|   941k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
  560|   941k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
  561|   941k|}
av1_inv_txfm_avx2.c:idct32_stage9_avx2:
  563|   941k|static inline void idct32_stage9_avx2(__m256i *output, __m256i *x) {
  564|   941k|  btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
  565|   941k|  btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
  566|   941k|  btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
  567|   941k|  btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
  568|   941k|  btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
  569|   941k|  btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
  570|   941k|  btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
  571|   941k|  btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
  572|   941k|  btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
  573|   941k|  btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
  574|   941k|  btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
  575|   941k|  btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
  576|   941k|  btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
  577|   941k|  btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
  578|   941k|  btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
  579|   941k|  btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
  580|   941k|}
av1_inv_txfm_avx2.c:idct32_low16_avx2:
  689|   181k|static void idct32_low16_avx2(const __m256i *input, __m256i *output) {
  690|   181k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   181k|#define INV_COS_BIT 12
  ------------------
  691|   181k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   181k|#define INV_COS_BIT 12
  ------------------
  692|       |
  693|       |  // stage 1
  694|   181k|  __m256i x[32];
  695|   181k|  x[0] = input[0];
  696|   181k|  x[2] = input[8];
  697|   181k|  x[4] = input[4];
  698|   181k|  x[6] = input[12];
  699|   181k|  x[8] = input[2];
  700|   181k|  x[10] = input[10];
  701|   181k|  x[12] = input[6];
  702|   181k|  x[14] = input[14];
  703|   181k|  x[16] = input[1];
  704|   181k|  x[18] = input[9];
  705|   181k|  x[20] = input[5];
  706|   181k|  x[22] = input[13];
  707|   181k|  x[24] = input[3];
  708|   181k|  x[26] = input[11];
  709|   181k|  x[28] = input[7];
  710|   181k|  x[30] = input[15];
  711|       |
  712|       |  // stage 2
  713|   181k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  714|   181k|  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  715|   181k|  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  716|   181k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  717|   181k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  718|   181k|  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  719|   181k|  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  720|   181k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  721|       |
  722|       |  // stage 3
  723|   181k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  724|   181k|  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  725|   181k|  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  726|   181k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  727|   181k|  idct32_high16_stage3_avx2(x);
  728|       |
  729|       |  // stage 4
  730|   181k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  731|   181k|  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  732|   181k|  btf_16_adds_subs_avx2(&x[8], &x[9]);
  733|   181k|  btf_16_adds_subs_avx2(&x[11], &x[10]);
  734|   181k|  btf_16_adds_subs_avx2(&x[12], &x[13]);
  735|   181k|  btf_16_adds_subs_avx2(&x[15], &x[14]);
  736|   181k|  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   181k|#define INV_COS_BIT 12
  ------------------
  737|       |
  738|       |  // stage 5
  739|   181k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  740|   181k|  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  ------------------
  |  |   30|   181k|  do {                                             \
  |  |   31|   181k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   181k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   181k|    const __m256i _in = in;                        \
  |  |   34|   181k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   181k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   181k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  741|   181k|  btf_16_adds_subs_avx2(&x[4], &x[5]);
  742|   181k|  btf_16_adds_subs_avx2(&x[7], &x[6]);
  743|   181k|  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   181k|#define INV_COS_BIT 12
  ------------------
  744|       |
  745|   181k|  btf_16_adds_subs_avx2(&x[0], &x[3]);
  746|   181k|  btf_16_adds_subs_avx2(&x[1], &x[2]);
  747|   181k|  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   181k|#define INV_COS_BIT 12
  ------------------
  748|       |
  749|   181k|  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   181k|#define INV_COS_BIT 12
  ------------------
  750|   181k|  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   181k|#define INV_COS_BIT 12
  ------------------
  751|   181k|  idct32_stage9_avx2(output, x);
  752|   181k|}
av1_inv_txfm_avx2.c:idct32_high16_stage3_avx2:
  465|   315k|static inline void idct32_high16_stage3_avx2(__m256i *x) {
  466|   315k|  btf_16_adds_subs_avx2(&x[16], &x[17]);
  467|   315k|  btf_16_adds_subs_avx2(&x[19], &x[18]);
  468|   315k|  btf_16_adds_subs_avx2(&x[20], &x[21]);
  469|   315k|  btf_16_adds_subs_avx2(&x[23], &x[22]);
  470|   315k|  btf_16_adds_subs_avx2(&x[24], &x[25]);
  471|   315k|  btf_16_adds_subs_avx2(&x[27], &x[26]);
  472|   315k|  btf_16_adds_subs_avx2(&x[28], &x[29]);
  473|   315k|  btf_16_adds_subs_avx2(&x[31], &x[30]);
  474|   315k|}
av1_inv_txfm_avx2.c:idct32_avx2:
  754|   134k|static void idct32_avx2(const __m256i *input, __m256i *output) {
  755|   134k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  756|   134k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  757|       |
  758|   134k|  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  759|   134k|  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  760|   134k|  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  761|   134k|  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  762|   134k|  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  763|   134k|  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  764|   134k|  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  765|   134k|  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  766|   134k|  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  767|   134k|  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  768|   134k|  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  769|   134k|  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  770|   134k|  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  771|   134k|  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  772|   134k|  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  773|   134k|  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  774|   134k|  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  775|   134k|  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  776|   134k|  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  777|   134k|  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  778|   134k|  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  779|   134k|  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  780|   134k|  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  781|   134k|  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  782|   134k|  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  783|   134k|  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  784|   134k|  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  785|   134k|  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  786|   134k|  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  787|   134k|  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  788|   134k|  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  789|   134k|  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  790|       |
  791|       |  // stage 1
  792|   134k|  __m256i x1[32];
  793|   134k|  x1[0] = input[0];
  794|   134k|  x1[1] = input[16];
  795|   134k|  x1[2] = input[8];
  796|   134k|  x1[3] = input[24];
  797|   134k|  x1[4] = input[4];
  798|   134k|  x1[5] = input[20];
  799|   134k|  x1[6] = input[12];
  800|   134k|  x1[7] = input[28];
  801|   134k|  x1[8] = input[2];
  802|   134k|  x1[9] = input[18];
  803|   134k|  x1[10] = input[10];
  804|   134k|  x1[11] = input[26];
  805|   134k|  x1[12] = input[6];
  806|   134k|  x1[13] = input[22];
  807|   134k|  x1[14] = input[14];
  808|   134k|  x1[15] = input[30];
  809|   134k|  x1[16] = input[1];
  810|   134k|  x1[17] = input[17];
  811|   134k|  x1[18] = input[9];
  812|   134k|  x1[19] = input[25];
  813|   134k|  x1[20] = input[5];
  814|   134k|  x1[21] = input[21];
  815|   134k|  x1[22] = input[13];
  816|   134k|  x1[23] = input[29];
  817|   134k|  x1[24] = input[3];
  818|   134k|  x1[25] = input[19];
  819|   134k|  x1[26] = input[11];
  820|   134k|  x1[27] = input[27];
  821|   134k|  x1[28] = input[7];
  822|   134k|  x1[29] = input[23];
  823|   134k|  x1[30] = input[15];
  824|   134k|  x1[31] = input[31];
  825|       |
  826|       |  // stage 2
  827|   134k|  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r,
  828|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  829|   134k|  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r,
  830|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  831|   134k|  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r,
  832|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  833|   134k|  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r,
  834|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  835|   134k|  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r,
  836|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  837|   134k|  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r,
  838|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  839|   134k|  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r,
  840|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  841|   134k|  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r,
  842|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  843|       |
  844|       |  // stage 3
  845|   134k|  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
  846|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  847|   134k|  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
  848|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  849|   134k|  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
  850|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  851|   134k|  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
  852|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  853|   134k|  idct32_high16_stage3_avx2(x1);
  854|       |
  855|       |  // stage 4
  856|   134k|  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
  857|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  858|   134k|  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
  859|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  860|   134k|  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  861|   134k|  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  862|   134k|  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  863|   134k|  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  864|   134k|  idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  865|       |
  866|       |  // stage 5
  867|   134k|  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
  868|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  869|   134k|  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
  870|   134k|                  INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  871|   134k|  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  872|   134k|  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  873|   134k|  idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  874|       |
  875|       |  // stage 6
  876|   134k|  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  877|   134k|  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  878|   134k|  idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  879|       |
  880|   134k|  idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  881|   134k|  idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   134k|#define INV_COS_BIT 12
  ------------------
  882|   134k|  idct32_stage9_avx2(output, x1);
  883|   134k|}
av1_inv_txfm_avx2.c:idct64_low1_avx2:
 1126|   147k|static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
 1127|   147k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   147k|#define INV_COS_BIT 12
  ------------------
 1128|       |
 1129|       |  // stage 1
 1130|   147k|  __m256i x[32];
 1131|   147k|  x[0] = input[0];
 1132|       |
 1133|       |  // stage 2
 1134|       |  // stage 3
 1135|       |  // stage 4
 1136|       |  // stage 5
 1137|       |  // stage 6
 1138|   147k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|   147k|  do {                                             \
  |  |   31|   147k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   147k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   147k|    const __m256i _in = in;                        \
  |  |   34|   147k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   147k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   147k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1139|       |
 1140|       |  // stage 7
 1141|       |  // stage 8
 1142|       |  // stage 9
 1143|       |  // stage 10
 1144|       |  // stage 11
 1145|   147k|  output[0] = x[0];
 1146|   147k|  output[63] = x[0];
 1147|   147k|  output[1] = x[1];
 1148|   147k|  output[62] = x[1];
 1149|   147k|  output[2] = x[1];
 1150|   147k|  output[61] = x[1];
 1151|   147k|  output[3] = x[0];
 1152|   147k|  output[60] = x[0];
 1153|   147k|  output[4] = x[0];
 1154|   147k|  output[59] = x[0];
 1155|   147k|  output[5] = x[1];
 1156|   147k|  output[58] = x[1];
 1157|   147k|  output[6] = x[1];
 1158|   147k|  output[57] = x[1];
 1159|   147k|  output[7] = x[0];
 1160|   147k|  output[56] = x[0];
 1161|   147k|  output[8] = x[0];
 1162|   147k|  output[55] = x[0];
 1163|   147k|  output[9] = x[1];
 1164|   147k|  output[54] = x[1];
 1165|   147k|  output[10] = x[1];
 1166|   147k|  output[53] = x[1];
 1167|   147k|  output[11] = x[0];
 1168|   147k|  output[52] = x[0];
 1169|   147k|  output[12] = x[0];
 1170|   147k|  output[51] = x[0];
 1171|   147k|  output[13] = x[1];
 1172|   147k|  output[50] = x[1];
 1173|   147k|  output[14] = x[1];
 1174|   147k|  output[49] = x[1];
 1175|   147k|  output[15] = x[0];
 1176|   147k|  output[48] = x[0];
 1177|   147k|  output[16] = x[0];
 1178|   147k|  output[47] = x[0];
 1179|   147k|  output[17] = x[1];
 1180|   147k|  output[46] = x[1];
 1181|   147k|  output[18] = x[1];
 1182|   147k|  output[45] = x[1];
 1183|   147k|  output[19] = x[0];
 1184|   147k|  output[44] = x[0];
 1185|   147k|  output[20] = x[0];
 1186|   147k|  output[43] = x[0];
 1187|   147k|  output[21] = x[1];
 1188|   147k|  output[42] = x[1];
 1189|   147k|  output[22] = x[1];
 1190|   147k|  output[41] = x[1];
 1191|   147k|  output[23] = x[0];
 1192|   147k|  output[40] = x[0];
 1193|   147k|  output[24] = x[0];
 1194|   147k|  output[39] = x[0];
 1195|   147k|  output[25] = x[1];
 1196|   147k|  output[38] = x[1];
 1197|   147k|  output[26] = x[1];
 1198|   147k|  output[37] = x[1];
 1199|   147k|  output[27] = x[0];
 1200|   147k|  output[36] = x[0];
 1201|   147k|  output[28] = x[0];
 1202|   147k|  output[35] = x[0];
 1203|   147k|  output[29] = x[1];
 1204|   147k|  output[34] = x[1];
 1205|   147k|  output[30] = x[1];
 1206|   147k|  output[33] = x[1];
 1207|   147k|  output[31] = x[0];
 1208|   147k|  output[32] = x[0];
 1209|   147k|}
av1_inv_txfm_avx2.c:idct64_low8_avx2:
 1211|   223k|static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
 1212|   223k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1213|   223k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1214|   223k|  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
 1215|   223k|  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
 1216|   223k|  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
 1217|   223k|  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
 1218|   223k|  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
 1219|   223k|  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
 1220|   223k|  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
 1221|   223k|  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
 1222|   223k|  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
 1223|   223k|  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
 1224|   223k|  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
 1225|   223k|  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
 1226|   223k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1227|   223k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 1228|   223k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 1229|   223k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1230|       |
 1231|       |  // stage 1
 1232|   223k|  __m256i x[64];
 1233|   223k|  x[0] = input[0];
 1234|   223k|  x[8] = input[4];
 1235|   223k|  x[16] = input[2];
 1236|   223k|  x[24] = input[6];
 1237|   223k|  x[32] = input[1];
 1238|   223k|  x[40] = input[5];
 1239|   223k|  x[48] = input[3];
 1240|   223k|  x[56] = input[7];
 1241|       |
 1242|       |  // stage 2
 1243|   223k|  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  ------------------
  |  |   30|   223k|  do {                                             \
  |  |   31|   223k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   223k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   223k|    const __m256i _in = in;                        \
  |  |   34|   223k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   223k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   223k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1244|   223k|  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  ------------------
  |  |   30|   223k|  do {                                             \
  |  |   31|   223k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   223k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   223k|    const __m256i _in = in;                        \
  |  |   34|   223k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   223k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   223k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1245|   223k|  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  ------------------
  |  |   30|   223k|  do {                                             \
  |  |   31|   223k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   223k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   223k|    const __m256i _in = in;                        \
  |  |   34|   223k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   223k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   223k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1246|   223k|  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
  ------------------
  |  |   30|   223k|  do {                                             \
  |  |   31|   223k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   223k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   223k|    const __m256i _in = in;                        \
  |  |   34|   223k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   223k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   223k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1247|       |
 1248|       |  // stage 3
 1249|   223k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|   223k|  do {                                             \
  |  |   31|   223k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   223k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   223k|    const __m256i _in = in;                        \
  |  |   34|   223k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   223k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   223k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1250|   223k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|   223k|  do {                                             \
  |  |   31|   223k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   223k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   223k|    const __m256i _in = in;                        \
  |  |   34|   223k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   223k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   223k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1251|   223k|  x[33] = x[32];
 1252|   223k|  x[38] = x[39];
 1253|   223k|  x[41] = x[40];
 1254|   223k|  x[46] = x[47];
 1255|   223k|  x[49] = x[48];
 1256|   223k|  x[54] = x[55];
 1257|   223k|  x[57] = x[56];
 1258|   223k|  x[62] = x[63];
 1259|       |
 1260|       |  // stage 4
 1261|   223k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|   223k|  do {                                             \
  |  |   31|   223k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   223k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   223k|    const __m256i _in = in;                        \
  |  |   34|   223k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   223k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   223k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1262|   223k|  x[17] = x[16];
 1263|   223k|  x[22] = x[23];
 1264|   223k|  x[25] = x[24];
 1265|   223k|  x[30] = x[31];
 1266|   223k|  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
 1267|   223k|                  INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1268|   223k|  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
 1269|   223k|                  INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1270|   223k|  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
 1271|   223k|                  INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1272|   223k|  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
 1273|   223k|                  INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1274|       |
 1275|       |  // stage 5
 1276|   223k|  x[9] = x[8];
 1277|   223k|  x[14] = x[15];
 1278|   223k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
 1279|   223k|                  INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1280|   223k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
 1281|   223k|                  INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1282|   223k|  x[35] = x[32];
 1283|   223k|  x[34] = x[33];
 1284|   223k|  x[36] = x[39];
 1285|   223k|  x[37] = x[38];
 1286|   223k|  x[43] = x[40];
 1287|   223k|  x[42] = x[41];
 1288|   223k|  x[44] = x[47];
 1289|   223k|  x[45] = x[46];
 1290|   223k|  x[51] = x[48];
 1291|   223k|  x[50] = x[49];
 1292|   223k|  x[52] = x[55];
 1293|   223k|  x[53] = x[54];
 1294|   223k|  x[59] = x[56];
 1295|   223k|  x[58] = x[57];
 1296|   223k|  x[60] = x[63];
 1297|   223k|  x[61] = x[62];
 1298|       |
 1299|       |  // stage 6
 1300|   223k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|   223k|  do {                                             \
  |  |   31|   223k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   223k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   223k|    const __m256i _in = in;                        \
  |  |   34|   223k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   223k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   223k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1301|   223k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1302|   223k|  x[19] = x[16];
 1303|   223k|  x[18] = x[17];
 1304|   223k|  x[20] = x[23];
 1305|   223k|  x[21] = x[22];
 1306|   223k|  x[27] = x[24];
 1307|   223k|  x[26] = x[25];
 1308|   223k|  x[28] = x[31];
 1309|   223k|  x[29] = x[30];
 1310|   223k|  idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1311|       |
 1312|       |  // stage 7
 1313|   223k|  x[3] = x[0];
 1314|   223k|  x[2] = x[1];
 1315|   223k|  x[11] = x[8];
 1316|   223k|  x[10] = x[9];
 1317|   223k|  x[12] = x[15];
 1318|   223k|  x[13] = x[14];
 1319|   223k|  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1320|       |
 1321|       |  // stage 8
 1322|   223k|  x[7] = x[0];
 1323|   223k|  x[6] = x[1];
 1324|   223k|  x[5] = x[2];
 1325|   223k|  x[4] = x[3];
 1326|   223k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
 1327|   223k|                  INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1328|   223k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
 1329|   223k|                  INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1330|   223k|  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1331|       |
 1332|   223k|  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1333|   223k|  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   223k|#define INV_COS_BIT 12
  ------------------
 1334|   223k|  idct64_stage11_avx2(output, x);
 1335|   223k|}
av1_inv_txfm_avx2.c:idct64_stage6_high32_avx2:
  942|   460k|                                             const __m256i _r, int8_t cos_bit) {
  943|   460k|  (void)cos_bit;
  944|   460k|  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  945|   460k|  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  946|   460k|  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  947|   460k|  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  948|   460k|  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  949|   460k|  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  950|   460k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
  951|   460k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
  952|   460k|  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
  953|   460k|  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
  954|   460k|  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
  955|   460k|  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
  956|   460k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
  957|   460k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
  958|   460k|}
av1_inv_txfm_avx2.c:idct64_stage7_high48_avx2:
  974|   460k|                                             const __m256i _r, int8_t cos_bit) {
  975|   460k|  (void)cos_bit;
  976|   460k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  977|   460k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  978|   460k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  979|   460k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  980|   460k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  981|   460k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  982|   460k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
  983|   460k|  btf_16_adds_subs_avx2(&x[32], &x[39]);
  984|   460k|  btf_16_adds_subs_avx2(&x[33], &x[38]);
  985|   460k|  btf_16_adds_subs_avx2(&x[34], &x[37]);
  986|   460k|  btf_16_adds_subs_avx2(&x[35], &x[36]);
  987|   460k|  btf_16_adds_subs_avx2(&x[47], &x[40]);
  988|   460k|  btf_16_adds_subs_avx2(&x[46], &x[41]);
  989|   460k|  btf_16_adds_subs_avx2(&x[45], &x[42]);
  990|   460k|  btf_16_adds_subs_avx2(&x[44], &x[43]);
  991|   460k|  btf_16_adds_subs_avx2(&x[48], &x[55]);
  992|   460k|  btf_16_adds_subs_avx2(&x[49], &x[54]);
  993|   460k|  btf_16_adds_subs_avx2(&x[50], &x[53]);
  994|   460k|  btf_16_adds_subs_avx2(&x[51], &x[52]);
  995|   460k|  btf_16_adds_subs_avx2(&x[63], &x[56]);
  996|   460k|  btf_16_adds_subs_avx2(&x[62], &x[57]);
  997|   460k|  btf_16_adds_subs_avx2(&x[61], &x[58]);
  998|   460k|  btf_16_adds_subs_avx2(&x[60], &x[59]);
  999|   460k|}
av1_inv_txfm_avx2.c:idct64_stage8_high48_avx2:
 1002|   460k|                                             const __m256i _r, int8_t cos_bit) {
 1003|   460k|  (void)cos_bit;
 1004|   460k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 1005|   460k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 1006|   460k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 1007|   460k|  btf_16_adds_subs_avx2(&x[16], &x[23]);
 1008|   460k|  btf_16_adds_subs_avx2(&x[17], &x[22]);
 1009|   460k|  btf_16_adds_subs_avx2(&x[18], &x[21]);
 1010|   460k|  btf_16_adds_subs_avx2(&x[19], &x[20]);
 1011|   460k|  btf_16_adds_subs_avx2(&x[31], &x[24]);
 1012|   460k|  btf_16_adds_subs_avx2(&x[30], &x[25]);
 1013|   460k|  btf_16_adds_subs_avx2(&x[29], &x[26]);
 1014|   460k|  btf_16_adds_subs_avx2(&x[28], &x[27]);
 1015|   460k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
 1016|   460k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
 1017|   460k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
 1018|   460k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
 1019|   460k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
 1020|   460k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
 1021|   460k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
 1022|   460k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
 1023|   460k|}
av1_inv_txfm_avx2.c:idct64_stage9_avx2:
 1026|   460k|                                      const __m256i _r, int8_t cos_bit) {
 1027|   460k|  (void)cos_bit;
 1028|   460k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1029|   460k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1030|   460k|  btf_16_adds_subs_avx2(&x[0], &x[15]);
 1031|   460k|  btf_16_adds_subs_avx2(&x[1], &x[14]);
 1032|   460k|  btf_16_adds_subs_avx2(&x[2], &x[13]);
 1033|   460k|  btf_16_adds_subs_avx2(&x[3], &x[12]);
 1034|   460k|  btf_16_adds_subs_avx2(&x[4], &x[11]);
 1035|   460k|  btf_16_adds_subs_avx2(&x[5], &x[10]);
 1036|   460k|  btf_16_adds_subs_avx2(&x[6], &x[9]);
 1037|   460k|  btf_16_adds_subs_avx2(&x[7], &x[8]);
 1038|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
 1039|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
 1040|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
 1041|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
 1042|   460k|  btf_16_adds_subs_avx2(&x[32], &x[47]);
 1043|   460k|  btf_16_adds_subs_avx2(&x[33], &x[46]);
 1044|   460k|  btf_16_adds_subs_avx2(&x[34], &x[45]);
 1045|   460k|  btf_16_adds_subs_avx2(&x[35], &x[44]);
 1046|   460k|  btf_16_adds_subs_avx2(&x[36], &x[43]);
 1047|   460k|  btf_16_adds_subs_avx2(&x[37], &x[42]);
 1048|   460k|  btf_16_adds_subs_avx2(&x[38], &x[41]);
 1049|   460k|  btf_16_adds_subs_avx2(&x[39], &x[40]);
 1050|   460k|  btf_16_adds_subs_avx2(&x[63], &x[48]);
 1051|   460k|  btf_16_adds_subs_avx2(&x[62], &x[49]);
 1052|   460k|  btf_16_adds_subs_avx2(&x[61], &x[50]);
 1053|   460k|  btf_16_adds_subs_avx2(&x[60], &x[51]);
 1054|   460k|  btf_16_adds_subs_avx2(&x[59], &x[52]);
 1055|   460k|  btf_16_adds_subs_avx2(&x[58], &x[53]);
 1056|   460k|  btf_16_adds_subs_avx2(&x[57], &x[54]);
 1057|   460k|  btf_16_adds_subs_avx2(&x[56], &x[55]);
 1058|   460k|}
av1_inv_txfm_avx2.c:idct64_stage10_avx2:
 1061|   460k|                                       const __m256i _r, int8_t cos_bit) {
 1062|   460k|  (void)cos_bit;
 1063|   460k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1064|   460k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1065|   460k|  btf_16_adds_subs_avx2(&x[0], &x[31]);
 1066|   460k|  btf_16_adds_subs_avx2(&x[1], &x[30]);
 1067|   460k|  btf_16_adds_subs_avx2(&x[2], &x[29]);
 1068|   460k|  btf_16_adds_subs_avx2(&x[3], &x[28]);
 1069|   460k|  btf_16_adds_subs_avx2(&x[4], &x[27]);
 1070|   460k|  btf_16_adds_subs_avx2(&x[5], &x[26]);
 1071|   460k|  btf_16_adds_subs_avx2(&x[6], &x[25]);
 1072|   460k|  btf_16_adds_subs_avx2(&x[7], &x[24]);
 1073|   460k|  btf_16_adds_subs_avx2(&x[8], &x[23]);
 1074|   460k|  btf_16_adds_subs_avx2(&x[9], &x[22]);
 1075|   460k|  btf_16_adds_subs_avx2(&x[10], &x[21]);
 1076|   460k|  btf_16_adds_subs_avx2(&x[11], &x[20]);
 1077|   460k|  btf_16_adds_subs_avx2(&x[12], &x[19]);
 1078|   460k|  btf_16_adds_subs_avx2(&x[13], &x[18]);
 1079|   460k|  btf_16_adds_subs_avx2(&x[14], &x[17]);
 1080|   460k|  btf_16_adds_subs_avx2(&x[15], &x[16]);
 1081|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
 1082|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
 1083|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
 1084|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
 1085|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
 1086|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
 1087|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
 1088|   460k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
 1089|   460k|}
av1_inv_txfm_avx2.c:idct64_stage11_avx2:
 1091|   460k|static inline void idct64_stage11_avx2(__m256i *output, __m256i *x) {
 1092|   460k|  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
 1093|   460k|  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
 1094|   460k|  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
 1095|   460k|  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
 1096|   460k|  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
 1097|   460k|  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
 1098|   460k|  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
 1099|   460k|  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
 1100|   460k|  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
 1101|   460k|  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
 1102|   460k|  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
 1103|   460k|  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
 1104|   460k|  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
 1105|   460k|  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
 1106|   460k|  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
 1107|   460k|  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
 1108|   460k|  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
 1109|   460k|  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
 1110|   460k|  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
 1111|   460k|  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
 1112|   460k|  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
 1113|   460k|  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
 1114|   460k|  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
 1115|   460k|  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
 1116|   460k|  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
 1117|   460k|  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
 1118|   460k|  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
 1119|   460k|  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
 1120|   460k|  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
 1121|   460k|  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
 1122|   460k|  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
 1123|   460k|  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
 1124|   460k|}
av1_inv_txfm_avx2.c:idct64_low16_avx2:
 1337|   136k|static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
 1338|   136k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1339|   136k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1340|       |
 1341|   136k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1342|   136k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 1343|   136k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 1344|   136k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 1345|   136k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1346|       |
 1347|       |  // stage 1
 1348|   136k|  __m256i x[64];
 1349|   136k|  x[0] = input[0];
 1350|   136k|  x[4] = input[8];
 1351|   136k|  x[8] = input[4];
 1352|   136k|  x[12] = input[12];
 1353|   136k|  x[16] = input[2];
 1354|   136k|  x[20] = input[10];
 1355|   136k|  x[24] = input[6];
 1356|   136k|  x[28] = input[14];
 1357|   136k|  x[32] = input[1];
 1358|   136k|  x[36] = input[9];
 1359|   136k|  x[40] = input[5];
 1360|   136k|  x[44] = input[13];
 1361|   136k|  x[48] = input[3];
 1362|   136k|  x[52] = input[11];
 1363|   136k|  x[56] = input[7];
 1364|   136k|  x[60] = input[15];
 1365|       |
 1366|       |  // stage 2
 1367|   136k|  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1368|   136k|  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1369|   136k|  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1370|   136k|  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1371|   136k|  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1372|   136k|  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1373|   136k|  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1374|   136k|  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1375|       |
 1376|       |  // stage 3
 1377|   136k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1378|   136k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1379|   136k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1380|   136k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1381|   136k|  x[33] = x[32];
 1382|   136k|  x[34] = x[35];
 1383|   136k|  x[37] = x[36];
 1384|   136k|  x[38] = x[39];
 1385|   136k|  x[41] = x[40];
 1386|   136k|  x[42] = x[43];
 1387|   136k|  x[45] = x[44];
 1388|   136k|  x[46] = x[47];
 1389|   136k|  x[49] = x[48];
 1390|   136k|  x[50] = x[51];
 1391|   136k|  x[53] = x[52];
 1392|   136k|  x[54] = x[55];
 1393|   136k|  x[57] = x[56];
 1394|   136k|  x[58] = x[59];
 1395|   136k|  x[61] = x[60];
 1396|   136k|  x[62] = x[63];
 1397|       |
 1398|       |  // stage 4
 1399|   136k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1400|   136k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1401|   136k|  x[17] = x[16];
 1402|   136k|  x[18] = x[19];
 1403|   136k|  x[21] = x[20];
 1404|   136k|  x[22] = x[23];
 1405|   136k|  x[25] = x[24];
 1406|   136k|  x[26] = x[27];
 1407|   136k|  x[29] = x[28];
 1408|   136k|  x[30] = x[31];
 1409|   136k|  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1410|       |
 1411|       |  // stage 5
 1412|   136k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1413|   136k|  x[9] = x[8];
 1414|   136k|  x[10] = x[11];
 1415|   136k|  x[13] = x[12];
 1416|   136k|  x[14] = x[15];
 1417|   136k|  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1418|       |
 1419|       |  // stage 6
 1420|   136k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|   136k|  do {                                             \
  |  |   31|   136k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   136k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   136k|    const __m256i _in = in;                        \
  |  |   34|   136k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   136k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   136k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1421|   136k|  x[5] = x[4];
 1422|   136k|  x[6] = x[7];
 1423|   136k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1424|   136k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
 1425|   136k|                  INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1426|   136k|  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1427|       |
 1428|       |  // stage 7
 1429|   136k|  x[3] = x[0];
 1430|   136k|  x[2] = x[1];
 1431|   136k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1432|   136k|  btf_16_adds_subs_avx2(&x[8], &x[11]);
 1433|   136k|  btf_16_adds_subs_avx2(&x[9], &x[10]);
 1434|   136k|  btf_16_adds_subs_avx2(&x[15], &x[12]);
 1435|   136k|  btf_16_adds_subs_avx2(&x[14], &x[13]);
 1436|   136k|  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1437|       |
 1438|       |  // stage 8
 1439|   136k|  btf_16_adds_subs_avx2(&x[0], &x[7]);
 1440|   136k|  btf_16_adds_subs_avx2(&x[1], &x[6]);
 1441|   136k|  btf_16_adds_subs_avx2(&x[2], &x[5]);
 1442|   136k|  btf_16_adds_subs_avx2(&x[3], &x[4]);
 1443|   136k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
 1444|   136k|                  INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1445|   136k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
 1446|   136k|                  INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1447|   136k|  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1448|       |
 1449|   136k|  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1450|   136k|  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   136k|#define INV_COS_BIT 12
  ------------------
 1451|   136k|  idct64_stage11_avx2(output, x);
 1452|   136k|}
av1_inv_txfm_avx2.c:idct64_stage4_high32_avx2:
  886|   237k|                                             const __m256i _r, int8_t cos_bit) {
  887|   237k|  (void)cos_bit;
  888|   237k|  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  889|   237k|  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  890|   237k|  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  891|   237k|  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  892|   237k|  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  893|   237k|  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  894|   237k|  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  895|   237k|  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  896|   237k|  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  897|   237k|  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  898|   237k|  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  899|   237k|  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  900|   237k|  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  901|   237k|  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
  902|   237k|  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
  903|   237k|  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  904|   237k|  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  905|   237k|  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
  906|   237k|  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
  907|   237k|  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
  908|   237k|}
av1_inv_txfm_avx2.c:idct64_stage5_high48_avx2:
  911|   237k|                                             const __m256i _r, int8_t cos_bit) {
  912|   237k|  (void)cos_bit;
  913|   237k|  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  914|   237k|  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  915|   237k|  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  916|   237k|  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  917|   237k|  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  918|   237k|  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  919|   237k|  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  920|   237k|  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  921|   237k|  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  922|   237k|  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  923|   237k|  btf_16_adds_subs_avx2(&x[32], &x[35]);
  924|   237k|  btf_16_adds_subs_avx2(&x[33], &x[34]);
  925|   237k|  btf_16_adds_subs_avx2(&x[39], &x[36]);
  926|   237k|  btf_16_adds_subs_avx2(&x[38], &x[37]);
  927|   237k|  btf_16_adds_subs_avx2(&x[40], &x[43]);
  928|   237k|  btf_16_adds_subs_avx2(&x[41], &x[42]);
  929|   237k|  btf_16_adds_subs_avx2(&x[47], &x[44]);
  930|   237k|  btf_16_adds_subs_avx2(&x[46], &x[45]);
  931|   237k|  btf_16_adds_subs_avx2(&x[48], &x[51]);
  932|   237k|  btf_16_adds_subs_avx2(&x[49], &x[50]);
  933|   237k|  btf_16_adds_subs_avx2(&x[55], &x[52]);
  934|   237k|  btf_16_adds_subs_avx2(&x[54], &x[53]);
  935|   237k|  btf_16_adds_subs_avx2(&x[56], &x[59]);
  936|   237k|  btf_16_adds_subs_avx2(&x[57], &x[58]);
  937|   237k|  btf_16_adds_subs_avx2(&x[63], &x[60]);
  938|   237k|  btf_16_adds_subs_avx2(&x[62], &x[61]);
  939|   237k|}
av1_inv_txfm_avx2.c:idct64_stage6_high48_avx2:
  961|   237k|                                             const __m256i _r, int8_t cos_bit) {
  962|   237k|  btf_16_adds_subs_avx2(&x[16], &x[19]);
  963|   237k|  btf_16_adds_subs_avx2(&x[17], &x[18]);
  964|   237k|  btf_16_adds_subs_avx2(&x[23], &x[20]);
  965|   237k|  btf_16_adds_subs_avx2(&x[22], &x[21]);
  966|   237k|  btf_16_adds_subs_avx2(&x[24], &x[27]);
  967|   237k|  btf_16_adds_subs_avx2(&x[25], &x[26]);
  968|   237k|  btf_16_adds_subs_avx2(&x[31], &x[28]);
  969|   237k|  btf_16_adds_subs_avx2(&x[30], &x[29]);
  970|   237k|  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
  971|   237k|}
av1_inv_txfm_avx2.c:idct64_low32_avx2:
 1454|   100k|static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
 1455|   100k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1456|   100k|  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1457|       |
 1458|   100k|  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
 1459|   100k|  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
 1460|   100k|  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
 1461|   100k|  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
 1462|   100k|  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
 1463|       |
 1464|       |  // stage 1
 1465|   100k|  __m256i x[64];
 1466|   100k|  x[0] = input[0];
 1467|   100k|  x[2] = input[16];
 1468|   100k|  x[4] = input[8];
 1469|   100k|  x[6] = input[24];
 1470|   100k|  x[8] = input[4];
 1471|   100k|  x[10] = input[20];
 1472|   100k|  x[12] = input[12];
 1473|   100k|  x[14] = input[28];
 1474|   100k|  x[16] = input[2];
 1475|   100k|  x[18] = input[18];
 1476|   100k|  x[20] = input[10];
 1477|   100k|  x[22] = input[26];
 1478|   100k|  x[24] = input[6];
 1479|   100k|  x[26] = input[22];
 1480|   100k|  x[28] = input[14];
 1481|   100k|  x[30] = input[30];
 1482|   100k|  x[32] = input[1];
 1483|   100k|  x[34] = input[17];
 1484|   100k|  x[36] = input[9];
 1485|   100k|  x[38] = input[25];
 1486|   100k|  x[40] = input[5];
 1487|   100k|  x[42] = input[21];
 1488|   100k|  x[44] = input[13];
 1489|   100k|  x[46] = input[29];
 1490|   100k|  x[48] = input[3];
 1491|   100k|  x[50] = input[19];
 1492|   100k|  x[52] = input[11];
 1493|   100k|  x[54] = input[27];
 1494|   100k|  x[56] = input[7];
 1495|   100k|  x[58] = input[23];
 1496|   100k|  x[60] = input[15];
 1497|   100k|  x[62] = input[31];
 1498|       |
 1499|       |  // stage 2
 1500|   100k|  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1501|   100k|  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1502|   100k|  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1503|   100k|  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1504|   100k|  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1505|   100k|  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1506|   100k|  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1507|   100k|  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1508|   100k|  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1509|   100k|  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1510|   100k|  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1511|   100k|  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1512|   100k|  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1513|   100k|  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1514|   100k|  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1515|   100k|  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1516|       |
 1517|       |  // stage 3
 1518|   100k|  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1519|   100k|  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1520|   100k|  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1521|   100k|  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1522|   100k|  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1523|   100k|  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1524|   100k|  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1525|   100k|  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1526|   100k|  btf_16_adds_subs_avx2(&x[32], &x[33]);
 1527|   100k|  btf_16_adds_subs_avx2(&x[35], &x[34]);
 1528|   100k|  btf_16_adds_subs_avx2(&x[36], &x[37]);
 1529|   100k|  btf_16_adds_subs_avx2(&x[39], &x[38]);
 1530|   100k|  btf_16_adds_subs_avx2(&x[40], &x[41]);
 1531|   100k|  btf_16_adds_subs_avx2(&x[43], &x[42]);
 1532|   100k|  btf_16_adds_subs_avx2(&x[44], &x[45]);
 1533|   100k|  btf_16_adds_subs_avx2(&x[47], &x[46]);
 1534|   100k|  btf_16_adds_subs_avx2(&x[48], &x[49]);
 1535|   100k|  btf_16_adds_subs_avx2(&x[51], &x[50]);
 1536|   100k|  btf_16_adds_subs_avx2(&x[52], &x[53]);
 1537|   100k|  btf_16_adds_subs_avx2(&x[55], &x[54]);
 1538|   100k|  btf_16_adds_subs_avx2(&x[56], &x[57]);
 1539|   100k|  btf_16_adds_subs_avx2(&x[59], &x[58]);
 1540|   100k|  btf_16_adds_subs_avx2(&x[60], &x[61]);
 1541|   100k|  btf_16_adds_subs_avx2(&x[63], &x[62]);
 1542|       |
 1543|       |  // stage 4
 1544|   100k|  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1545|   100k|  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1546|   100k|  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1547|   100k|  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1548|   100k|  btf_16_adds_subs_avx2(&x[16], &x[17]);
 1549|   100k|  btf_16_adds_subs_avx2(&x[19], &x[18]);
 1550|   100k|  btf_16_adds_subs_avx2(&x[20], &x[21]);
 1551|   100k|  btf_16_adds_subs_avx2(&x[23], &x[22]);
 1552|   100k|  btf_16_adds_subs_avx2(&x[24], &x[25]);
 1553|   100k|  btf_16_adds_subs_avx2(&x[27], &x[26]);
 1554|   100k|  btf_16_adds_subs_avx2(&x[28], &x[29]);
 1555|   100k|  btf_16_adds_subs_avx2(&x[31], &x[30]);
 1556|   100k|  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1557|       |
 1558|       |  // stage 5
 1559|   100k|  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1560|   100k|  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1561|   100k|  btf_16_adds_subs_avx2(&x[8], &x[9]);
 1562|   100k|  btf_16_adds_subs_avx2(&x[11], &x[10]);
 1563|   100k|  btf_16_adds_subs_avx2(&x[12], &x[13]);
 1564|   100k|  btf_16_adds_subs_avx2(&x[15], &x[14]);
 1565|   100k|  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1566|       |
 1567|       |  // stage 6
 1568|   100k|  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1569|   100k|  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  ------------------
  |  |   30|   100k|  do {                                             \
  |  |   31|   100k|    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
  |  |   32|   100k|    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
  |  |   33|   100k|    const __m256i _in = in;                        \
  |  |   34|   100k|    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
  |  |   35|   100k|    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
  |  |   36|   100k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (36:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1570|   100k|  btf_16_adds_subs_avx2(&x[4], &x[5]);
 1571|   100k|  btf_16_adds_subs_avx2(&x[7], &x[6]);
 1572|   100k|  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1573|   100k|  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
 1574|   100k|                  INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1575|   100k|  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1576|       |
 1577|       |  // stage 7
 1578|   100k|  btf_16_adds_subs_avx2(&x[0], &x[3]);
 1579|   100k|  btf_16_adds_subs_avx2(&x[1], &x[2]);
 1580|   100k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1581|   100k|  btf_16_adds_subs_avx2(&x[8], &x[11]);
 1582|   100k|  btf_16_adds_subs_avx2(&x[9], &x[10]);
 1583|   100k|  btf_16_adds_subs_avx2(&x[15], &x[12]);
 1584|   100k|  btf_16_adds_subs_avx2(&x[14], &x[13]);
 1585|   100k|  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1586|       |
 1587|       |  // stage 8
 1588|   100k|  btf_16_adds_subs_avx2(&x[0], &x[7]);
 1589|   100k|  btf_16_adds_subs_avx2(&x[1], &x[6]);
 1590|   100k|  btf_16_adds_subs_avx2(&x[2], &x[5]);
 1591|   100k|  btf_16_adds_subs_avx2(&x[3], &x[4]);
 1592|   100k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
 1593|   100k|                  INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1594|   100k|  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
 1595|   100k|                  INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1596|   100k|  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1597|       |
 1598|       |  // stage 9~11
 1599|   100k|  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1600|   100k|  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  ------------------
  |  |   43|   100k|#define INV_COS_BIT 12
  ------------------
 1601|   100k|  idct64_stage11_avx2(output, x);
 1602|   100k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_idtx_avx2:
 1768|  35.9k|                                                  int32_t eob) {
 1769|  35.9k|  (void)eob;
 1770|  35.9k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 1771|  35.9k|  const int txw_idx = get_txw_idx(tx_size);
 1772|  35.9k|  const int txh_idx = get_txh_idx(tx_size);
 1773|  35.9k|  const int txfm_size_col = tx_size_wide[tx_size];
 1774|  35.9k|  const int txfm_size_row = tx_size_high[tx_size];
 1775|  35.9k|  const int col_max = AOMMIN(32, txfm_size_col);
  ------------------
  |  |   34|  35.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 35.9k]
  |  |  ------------------
  ------------------
 1776|  35.9k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  35.9k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 35.9k]
  |  |  ------------------
  ------------------
 1777|  35.9k|  const int input_stride = row_max;
 1778|  35.9k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 1779|  35.9k|  __m256i buf[32];
 1780|       |
 1781|  79.0k|  for (int i = 0; i < (col_max >> 4); ++i) {
  ------------------
  |  Branch (1781:19): [True: 43.1k, False: 35.9k]
  ------------------
 1782|  93.9k|    for (int j = 0; j < (row_max >> 4); j++) {
  ------------------
  |  Branch (1782:21): [True: 50.7k, False: 43.1k]
  ------------------
 1783|  50.7k|      iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride,
 1784|  50.7k|                              row_max, shift[0], 16, txw_idx, rect_type);
 1785|  50.7k|      transpose_16bit_16x16_avx2(buf, buf);
 1786|  50.7k|      iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf,
 1787|  50.7k|                              shift[1], 16, txh_idx);
 1788|  50.7k|    }
 1789|  43.1k|  }
 1790|  35.9k|}
av1_inv_txfm_avx2.c:iidentity_row_16xn_avx2:
 1703|  56.0k|                                           int txw_idx, int rect_type) {
 1704|  56.0k|  const int32_t *input_row = input;
 1705|  56.0k|  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
 1706|  56.0k|  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
  ------------------
  |  |   41|  56.0k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1707|  56.0k|                                       (1 << (NewSqrt2Bits - shift - 1)));
  ------------------
  |  |   41|  56.0k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1708|  56.0k|  const __m256i one = _mm256_set1_epi16(1);
 1709|  56.0k|  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
 1710|  56.0k|  if (rect_type != 1 && rect_type != -1) {
  ------------------
  |  Branch (1710:7): [True: 44.7k, False: 11.3k]
  |  Branch (1710:25): [True: 35.9k, False: 8.81k]
  ------------------
 1711|   610k|    for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (1711:21): [True: 574k, False: 35.9k]
  ------------------
 1712|   574k|      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
 1713|   574k|      input_row += stride;
 1714|   574k|      __m256i lo = _mm256_unpacklo_epi16(src, one);
 1715|   574k|      __m256i hi = _mm256_unpackhi_epi16(src, one);
 1716|   574k|      lo = _mm256_madd_epi16(lo, scale__r);
 1717|   574k|      hi = _mm256_madd_epi16(hi, scale__r);
 1718|   574k|      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
  ------------------
  |  |   41|   574k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1719|   574k|      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
  ------------------
  |  |   41|   574k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1720|   574k|      out[i] = _mm256_packs_epi32(lo, hi);
 1721|   574k|    }
 1722|  35.9k|  } else {
 1723|  20.1k|    const __m256i rect_scale =
 1724|  20.1k|        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
  ------------------
  |  |   41|  20.1k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1725|   342k|    for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (1725:21): [True: 322k, False: 20.1k]
  ------------------
 1726|   322k|      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
 1727|   322k|      src = _mm256_mulhrs_epi16(src, rect_scale);
 1728|   322k|      input_row += stride;
 1729|   322k|      __m256i lo = _mm256_unpacklo_epi16(src, one);
 1730|   322k|      __m256i hi = _mm256_unpackhi_epi16(src, one);
 1731|   322k|      lo = _mm256_madd_epi16(lo, scale__r);
 1732|   322k|      hi = _mm256_madd_epi16(hi, scale__r);
 1733|   322k|      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
  ------------------
  |  |   41|   322k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1734|   322k|      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
  ------------------
  |  |   41|   322k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1735|   322k|      out[i] = _mm256_packs_epi32(lo, hi);
 1736|   322k|    }
 1737|  20.1k|  }
 1738|  56.0k|}
av1_inv_txfm_avx2.c:iidentity_col_16xn_avx2:
 1742|  69.1k|                                           int txh_idx) {
 1743|  69.1k|  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
 1744|  69.1k|  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
  ------------------
  |  |   41|  69.1k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1745|  69.1k|  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
 1746|  69.1k|  const __m256i one = _mm256_set1_epi16(1);
 1747|  69.1k|  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
 1748|  1.17M|  for (int h = 0; h < height; ++h) {
  ------------------
  |  Branch (1748:19): [True: 1.10M, False: 69.1k]
  ------------------
 1749|  1.10M|    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
 1750|  1.10M|    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
 1751|  1.10M|    lo = _mm256_madd_epi16(lo, scale_coeff);
 1752|  1.10M|    hi = _mm256_madd_epi16(hi, scale_coeff);
 1753|  1.10M|    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
  ------------------
  |  |   41|  1.10M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1754|  1.10M|    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
  ------------------
  |  |   41|  1.10M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 1755|  1.10M|    lo = _mm256_add_epi32(lo, shift__r);
 1756|  1.10M|    hi = _mm256_add_epi32(hi, shift__r);
 1757|  1.10M|    lo = _mm256_srai_epi32(lo, -shift);
 1758|  1.10M|    hi = _mm256_srai_epi32(hi, -shift);
 1759|  1.10M|    const __m256i x = _mm256_packs_epi32(lo, hi);
 1760|  1.10M|    write_recon_w16_avx2(x, output);
 1761|  1.10M|    output += stride;
 1762|  1.10M|  }
 1763|  69.1k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_h_identity_avx2:
 1794|  5.30k|    TX_SIZE tx_size, int eob) {
 1795|  5.30k|  int eobx, eoby;
 1796|  5.30k|  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
 1797|  5.30k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 1798|  5.30k|  const int txw_idx = get_txw_idx(tx_size);
 1799|  5.30k|  const int txh_idx = get_txh_idx(tx_size);
 1800|  5.30k|  const int txfm_size_col = tx_size_wide[tx_size];
 1801|  5.30k|  const int txfm_size_row = tx_size_high[tx_size];
 1802|  5.30k|  const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  5.30k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 5.30k]
  |  |  ------------------
  ------------------
 1803|  5.30k|  const int input_stride = txfm_size_row_notzero;
 1804|  5.30k|  const int buf_size_w_div16 = (eobx + 16) >> 4;
 1805|  5.30k|  const int buf_size_h_div16 = (eoby + 16) >> 4;
 1806|  5.30k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 1807|       |
 1808|  5.30k|  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 1809|  5.30k|  const transform_1d_avx2 col_txfm =
 1810|  5.30k|      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 1811|       |
 1812|  5.30k|  assert(col_txfm != NULL);
 1813|       |
 1814|  5.30k|  int ud_flip, lr_flip;
 1815|  5.30k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 1816|  10.6k|  for (int i = 0; i < buf_size_w_div16; i++) {
  ------------------
  |  Branch (1816:19): [True: 5.30k, False: 5.30k]
  ------------------
 1817|  5.30k|    __m256i buf0[64];
 1818|  10.6k|    for (int j = 0; j < buf_size_h_div16; j++) {
  ------------------
  |  Branch (1818:21): [True: 5.30k, False: 5.30k]
  ------------------
 1819|  5.30k|      __m256i *buf0_cur = buf0 + j * 16;
 1820|  5.30k|      const int32_t *input_cur = input + i * 16 * input_stride + j * 16;
 1821|  5.30k|      iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16,
 1822|  5.30k|                              txw_idx, rect_type);
 1823|  5.30k|      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
 1824|  5.30k|    }
 1825|  5.30k|    col_txfm(buf0, buf0);
 1826|  5.30k|    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
 1827|  5.30k|    int k = ud_flip ? (txfm_size_row - 1) : 0;
  ------------------
  |  Branch (1827:13): [True: 0, False: 5.30k]
  ------------------
 1828|  5.30k|    const int step = ud_flip ? -1 : 1;
  ------------------
  |  Branch (1828:22): [True: 0, False: 5.30k]
  ------------------
 1829|  90.1k|    for (int j = 0; j < txfm_size_row; ++j, k += step) {
  ------------------
  |  Branch (1829:21): [True: 84.8k, False: 5.30k]
  ------------------
 1830|  84.8k|      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
 1831|  84.8k|      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
 1832|  84.8k|    }
 1833|  5.30k|  }
 1834|  5.30k|}
av1_inv_txfm_avx2.c:lowbd_inv_txfm2d_add_v_identity_avx2:
 1838|  18.3k|    TX_SIZE tx_size, int eob) {
 1839|  18.3k|  __m256i buf1[64];
 1840|  18.3k|  int eobx, eoby;
 1841|  18.3k|  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
 1842|  18.3k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 1843|  18.3k|  const int txw_idx = get_txw_idx(tx_size);
 1844|  18.3k|  const int txh_idx = get_txh_idx(tx_size);
 1845|  18.3k|  const int txfm_size_col = tx_size_wide[tx_size];
 1846|  18.3k|  const int txfm_size_row = tx_size_high[tx_size];
 1847|  18.3k|  const int buf_size_w_div16 = txfm_size_col >> 4;
 1848|  18.3k|  const int buf_size_h_div16 = (eoby + 16) >> 4;
 1849|  18.3k|  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
 1850|  18.3k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  18.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 18.3k]
  |  |  ------------------
  ------------------
 1851|  18.3k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 1852|       |
 1853|  18.3k|  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 1854|  18.3k|  const transform_1d_avx2 row_txfm =
 1855|  18.3k|      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 1856|       |
 1857|  18.3k|  assert(row_txfm != NULL);
 1858|       |
 1859|  18.3k|  int ud_flip, lr_flip;
 1860|  18.3k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 1861|  36.7k|  for (int i = 0; i < buf_size_h_div16; i++) {
  ------------------
  |  Branch (1861:19): [True: 18.3k, False: 18.3k]
  ------------------
 1862|  18.3k|    __m256i buf0[64];
 1863|  18.3k|    load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0,
 1864|  18.3k|                                        buf_size_nonzero_w);
 1865|  18.3k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (1865:9): [True: 6, False: 18.3k]
  |  Branch (1865:27): [True: 0, False: 18.3k]
  ------------------
 1866|      0|      round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
 1867|      0|    }
 1868|  18.3k|    row_txfm(buf0, buf0);
 1869|  18.3k|    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
 1870|  18.3k|    __m256i *_buf1 = buf1;
 1871|  18.3k|    if (lr_flip) {
  ------------------
  |  Branch (1871:9): [True: 0, False: 18.3k]
  ------------------
 1872|      0|      for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1872:23): [True: 0, False: 0]
  ------------------
 1873|      0|        __m256i temp[16];
 1874|      0|        flip_buf_avx2(buf0 + 16 * j, temp, 16);
 1875|      0|        transpose_16bit_16x16_avx2(temp,
 1876|      0|                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
 1877|      0|      }
 1878|  18.3k|    } else {
 1879|  36.7k|      for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1879:23): [True: 18.3k, False: 18.3k]
  ------------------
 1880|  18.3k|        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
 1881|  18.3k|      }
 1882|  18.3k|    }
 1883|  36.7k|    for (int j = 0; j < buf_size_w_div16; ++j) {
  ------------------
  |  Branch (1883:21): [True: 18.3k, False: 18.3k]
  ------------------
 1884|  18.3k|      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
 1885|  18.3k|                              buf1 + j * 16, shift[1], 16, txh_idx);
 1886|  18.3k|    }
 1887|  18.3k|  }
 1888|  18.3k|}

av1_inv_txfm_avx2.c:round_shift_avx2:
   39|   260k|                                    int size) {
   40|   260k|  const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
   41|  4.62M|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (41:19): [True: 4.36M, False: 260k]
  ------------------
   42|  4.36M|    output[i] = _mm256_mulhrs_epi16(input[i], scale);
   43|  4.36M|  }
   44|   260k|}
av1_inv_txfm_avx2.c:lowbd_write_buffer_16xn_avx2:
   56|  2.76M|                                                int height) {
   57|  2.76M|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (57:11): [True: 22.6k, False: 2.73M]
  ------------------
   58|  2.76M|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (58:20): [True: 22.6k, False: 2.73M]
  ------------------
   59|  77.5M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (59:19): [True: 74.7M, False: 2.76M]
  ------------------
   60|  74.7M|    write_recon_w16_avx2(in[j], output + i * stride);
   61|  74.7M|  }
   62|  2.76M|}
av1_inv_txfm_avx2.c:write_recon_w16_avx2:
   46|  75.9M|static inline void write_recon_w16_avx2(__m256i res, uint8_t *output) {
   47|  75.9M|  __m128i pred = _mm_loadu_si128((__m128i const *)(output));
   48|  75.9M|  __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
   49|  75.9M|  __m128i y = _mm256_castsi256_si128(
   50|  75.9M|      _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168));
   51|  75.9M|  _mm_storeu_si128((__m128i *)(output), y);
   52|  75.9M|}

av1_idct8_low1_ssse3:
   79|  1.18M|void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output) {
   80|  1.18M|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  1.18M|#define INV_COS_BIT 12
  ------------------
   81|       |
   82|       |  // stage 1
   83|  1.18M|  __m128i x[2];
   84|  1.18M|  x[0] = input[0];
   85|       |
   86|       |  // stage 2
   87|       |  // stage 3
   88|  1.18M|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|  1.18M|  do {                                          \
  |  |   29|  1.18M|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  1.18M|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  1.18M|    const __m128i _in = in;                     \
  |  |   32|  1.18M|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  1.18M|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  1.18M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   89|       |
   90|       |  // stage 4
   91|       |  // stage 5
   92|  1.18M|  output[0] = x[0];
   93|  1.18M|  output[7] = x[0];
   94|  1.18M|  output[1] = x[1];
   95|  1.18M|  output[6] = x[1];
   96|  1.18M|  output[2] = x[1];
   97|  1.18M|  output[5] = x[1];
   98|  1.18M|  output[3] = x[0];
   99|  1.18M|  output[4] = x[0];
  100|  1.18M|}
av1_idct8_sse2:
  102|  2.16M|void av1_idct8_sse2(const __m128i *input, __m128i *output) {
  103|  2.16M|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  2.16M|#define INV_COS_BIT 12
  ------------------
  104|  2.16M|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  2.16M|#define INV_COS_BIT 12
  ------------------
  105|  2.16M|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  2.16M|#define INV_COS_BIT 12
  ------------------
  106|       |
  107|  2.16M|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  2.16M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  108|  2.16M|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  2.16M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  109|  2.16M|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|  2.16M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  110|  2.16M|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|  2.16M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  111|  2.16M|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  2.16M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  112|  2.16M|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  2.16M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  113|  2.16M|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  2.16M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  114|  2.16M|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  2.16M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  115|  2.16M|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|  2.16M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  116|       |
  117|       |  // stage 1
  118|  2.16M|  __m128i x[8];
  119|  2.16M|  x[0] = input[0];
  120|  2.16M|  x[1] = input[4];
  121|  2.16M|  x[2] = input[2];
  122|  2.16M|  x[3] = input[6];
  123|  2.16M|  x[4] = input[1];
  124|  2.16M|  x[5] = input[5];
  125|  2.16M|  x[6] = input[3];
  126|  2.16M|  x[7] = input[7];
  127|       |
  128|       |  // stage 2
  129|  2.16M|  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   61|  2.16M|  do {                                            \
  |  |   62|  2.16M|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  2.16M|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  2.16M|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  2.16M|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  2.16M|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  2.16M|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  2.16M|                                                  \
  |  |   69|  2.16M|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  2.16M|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  2.16M|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  2.16M|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  2.16M|                                                  \
  |  |   74|  2.16M|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  2.16M|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  2.16M|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  2.16M|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  2.16M|                                                  \
  |  |   79|  2.16M|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  2.16M|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  130|  2.16M|  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|  2.16M|  do {                                            \
  |  |   62|  2.16M|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  2.16M|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  2.16M|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  2.16M|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  2.16M|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  2.16M|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  2.16M|                                                  \
  |  |   69|  2.16M|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  2.16M|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  2.16M|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  2.16M|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  2.16M|                                                  \
  |  |   74|  2.16M|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  2.16M|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  2.16M|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  2.16M|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  2.16M|                                                  \
  |  |   79|  2.16M|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  2.16M|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  131|       |
  132|       |  // stage 3
  133|  2.16M|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|  2.16M|  do {                                            \
  |  |   62|  2.16M|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  2.16M|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  2.16M|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  2.16M|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  2.16M|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  2.16M|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  2.16M|                                                  \
  |  |   69|  2.16M|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  2.16M|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  2.16M|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  2.16M|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  2.16M|                                                  \
  |  |   74|  2.16M|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  2.16M|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  2.16M|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  2.16M|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  2.16M|                                                  \
  |  |   79|  2.16M|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  2.16M|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  134|  2.16M|  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  2.16M|  do {                                            \
  |  |   62|  2.16M|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  2.16M|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  2.16M|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  2.16M|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  2.16M|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  2.16M|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  2.16M|                                                  \
  |  |   69|  2.16M|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  2.16M|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  2.16M|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  2.16M|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  2.16M|                                                  \
  |  |   74|  2.16M|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  2.16M|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  2.16M|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  2.16M|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  2.16M|                                                  \
  |  |   79|  2.16M|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  2.16M|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  135|  2.16M|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  2.16M|  do {                                  \
  |  |   38|  2.16M|    const __m128i _in0 = in0;           \
  |  |   39|  2.16M|    const __m128i _in1 = in1;           \
  |  |   40|  2.16M|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  2.16M|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  136|  2.16M|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|  2.16M|  do {                                  \
  |  |   46|  2.16M|    const __m128i _in0 = in0;           \
  |  |   47|  2.16M|    const __m128i _in1 = in1;           \
  |  |   48|  2.16M|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  2.16M|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  137|       |
  138|       |  // stage 4
  139|  2.16M|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|  2.16M|  do {                                  \
  |  |   38|  2.16M|    const __m128i _in0 = in0;           \
  |  |   39|  2.16M|    const __m128i _in1 = in1;           \
  |  |   40|  2.16M|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  2.16M|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  140|  2.16M|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|  2.16M|  do {                                  \
  |  |   38|  2.16M|    const __m128i _in0 = in0;           \
  |  |   39|  2.16M|    const __m128i _in1 = in1;           \
  |  |   40|  2.16M|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  2.16M|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  141|  2.16M|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|  2.16M|  do {                                            \
  |  |   62|  2.16M|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  2.16M|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  2.16M|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  2.16M|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  2.16M|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  2.16M|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  2.16M|                                                  \
  |  |   69|  2.16M|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  2.16M|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  2.16M|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  2.16M|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  2.16M|                                                  \
  |  |   74|  2.16M|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  2.16M|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  2.16M|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  2.16M|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  2.16M|                                                  \
  |  |   79|  2.16M|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  2.16M|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  142|       |
  143|       |  // stage 5
  144|  2.16M|  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  ------------------
  |  |   53|  2.16M|  do {                                                  \
  |  |   54|  2.16M|    const __m128i _in0 = in0;                           \
  |  |   55|  2.16M|    const __m128i _in1 = in1;                           \
  |  |   56|  2.16M|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  2.16M|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  145|  2.16M|  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  ------------------
  |  |   53|  2.16M|  do {                                                  \
  |  |   54|  2.16M|    const __m128i _in0 = in0;                           \
  |  |   55|  2.16M|    const __m128i _in1 = in1;                           \
  |  |   56|  2.16M|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  2.16M|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  146|  2.16M|  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  ------------------
  |  |   53|  2.16M|  do {                                                  \
  |  |   54|  2.16M|    const __m128i _in0 = in0;                           \
  |  |   55|  2.16M|    const __m128i _in1 = in1;                           \
  |  |   56|  2.16M|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  2.16M|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  147|  2.16M|  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
  ------------------
  |  |   53|  2.16M|  do {                                                  \
  |  |   54|  2.16M|    const __m128i _in0 = in0;                           \
  |  |   55|  2.16M|    const __m128i _in1 = in1;                           \
  |  |   56|  2.16M|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  2.16M|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  2.16M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  148|  2.16M|}
av1_iadst8_low1_ssse3:
 1701|   307k|void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output) {
 1702|   307k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   307k|#define INV_COS_BIT 12
  ------------------
 1703|   307k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   307k|#define INV_COS_BIT 12
  ------------------
 1704|   307k|  const __m128i __zero = _mm_setzero_si128();
 1705|   307k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   307k|#define INV_COS_BIT 12
  ------------------
 1706|       |
 1707|   307k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   307k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1708|   307k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   307k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1709|   307k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   307k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1710|   307k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   307k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1711|       |
 1712|       |  // stage 1
 1713|   307k|  __m128i x[8];
 1714|   307k|  x[1] = input[0];
 1715|       |
 1716|       |  // stage 2
 1717|   307k|  btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
  ------------------
  |  |   28|   307k|  do {                                          \
  |  |   29|   307k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   307k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   307k|    const __m128i _in = in;                     \
  |  |   32|   307k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   307k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   307k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1718|       |
 1719|       |  // stage 3
 1720|   307k|  x[4] = x[0];
 1721|   307k|  x[5] = x[1];
 1722|       |
 1723|       |  // stage 4
 1724|   307k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|   307k|  do {                                            \
  |  |   62|   307k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   307k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   307k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   307k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   307k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   307k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   307k|                                                  \
  |  |   69|   307k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   307k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   307k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   307k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   307k|                                                  \
  |  |   74|   307k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   307k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   307k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   307k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   307k|                                                  \
  |  |   79|   307k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   307k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   307k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1725|       |
 1726|       |  // stage 5
 1727|   307k|  x[2] = x[0];
 1728|   307k|  x[3] = x[1];
 1729|   307k|  x[6] = x[4];
 1730|   307k|  x[7] = x[5];
 1731|       |
 1732|       |  // stage 6
 1733|   307k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|   307k|  do {                                            \
  |  |   62|   307k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   307k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   307k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   307k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   307k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   307k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   307k|                                                  \
  |  |   69|   307k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   307k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   307k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   307k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   307k|                                                  \
  |  |   74|   307k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   307k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   307k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   307k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   307k|                                                  \
  |  |   79|   307k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   307k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   307k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1734|   307k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|   307k|  do {                                            \
  |  |   62|   307k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   307k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   307k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   307k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   307k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   307k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   307k|                                                  \
  |  |   69|   307k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   307k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   307k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   307k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   307k|                                                  \
  |  |   74|   307k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   307k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   307k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   307k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   307k|                                                  \
  |  |   79|   307k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   307k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   307k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1735|       |
 1736|       |  // stage 7
 1737|   307k|  output[0] = x[0];
 1738|   307k|  output[1] = _mm_subs_epi16(__zero, x[4]);
 1739|   307k|  output[2] = x[6];
 1740|   307k|  output[3] = _mm_subs_epi16(__zero, x[2]);
 1741|   307k|  output[4] = x[3];
 1742|   307k|  output[5] = _mm_subs_epi16(__zero, x[7]);
 1743|   307k|  output[6] = x[5];
 1744|   307k|  output[7] = _mm_subs_epi16(__zero, x[1]);
 1745|   307k|}
av1_iadst8_sse2:
 1747|   666k|void av1_iadst8_sse2(const __m128i *input, __m128i *output) {
 1748|   666k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   666k|#define INV_COS_BIT 12
  ------------------
 1749|   666k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   666k|#define INV_COS_BIT 12
  ------------------
 1750|   666k|  const __m128i __zero = _mm_setzero_si128();
 1751|   666k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   666k|#define INV_COS_BIT 12
  ------------------
 1752|       |
 1753|   666k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1754|   666k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1755|   666k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1756|   666k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1757|   666k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1758|   666k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1759|   666k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1760|   666k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1761|   666k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1762|   666k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1763|   666k|  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1764|   666k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1765|   666k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   666k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1766|       |
 1767|       |  // stage 1
 1768|   666k|  __m128i x[8];
 1769|   666k|  x[0] = input[7];
 1770|   666k|  x[1] = input[0];
 1771|   666k|  x[2] = input[5];
 1772|   666k|  x[3] = input[2];
 1773|   666k|  x[4] = input[3];
 1774|   666k|  x[5] = input[4];
 1775|   666k|  x[6] = input[1];
 1776|   666k|  x[7] = input[6];
 1777|       |
 1778|       |  // stage 2
 1779|   666k|  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|   666k|  do {                                            \
  |  |   62|   666k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   666k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   666k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   666k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   666k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   666k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   666k|                                                  \
  |  |   69|   666k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   666k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   666k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   666k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   666k|                                                  \
  |  |   74|   666k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   666k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   666k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   666k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   666k|                                                  \
  |  |   79|   666k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   666k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1780|   666k|  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|   666k|  do {                                            \
  |  |   62|   666k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   666k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   666k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   666k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   666k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   666k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   666k|                                                  \
  |  |   69|   666k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   666k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   666k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   666k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   666k|                                                  \
  |  |   74|   666k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   666k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   666k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   666k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   666k|                                                  \
  |  |   79|   666k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   666k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1781|   666k|  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|   666k|  do {                                            \
  |  |   62|   666k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   666k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   666k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   666k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   666k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   666k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   666k|                                                  \
  |  |   69|   666k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   666k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   666k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   666k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   666k|                                                  \
  |  |   74|   666k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   666k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   666k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   666k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   666k|                                                  \
  |  |   79|   666k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   666k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1782|   666k|  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|   666k|  do {                                            \
  |  |   62|   666k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   666k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   666k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   666k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   666k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   666k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   666k|                                                  \
  |  |   69|   666k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   666k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   666k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   666k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   666k|                                                  \
  |  |   74|   666k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   666k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   666k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   666k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   666k|                                                  \
  |  |   79|   666k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   666k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1783|       |
 1784|       |  // stage 3
 1785|   666k|  btf_16_adds_subs_sse2(x[0], x[4]);
  ------------------
  |  |   37|   666k|  do {                                  \
  |  |   38|   666k|    const __m128i _in0 = in0;           \
  |  |   39|   666k|    const __m128i _in1 = in1;           \
  |  |   40|   666k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   666k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1786|   666k|  btf_16_adds_subs_sse2(x[1], x[5]);
  ------------------
  |  |   37|   666k|  do {                                  \
  |  |   38|   666k|    const __m128i _in0 = in0;           \
  |  |   39|   666k|    const __m128i _in1 = in1;           \
  |  |   40|   666k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   666k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1787|   666k|  btf_16_adds_subs_sse2(x[2], x[6]);
  ------------------
  |  |   37|   666k|  do {                                  \
  |  |   38|   666k|    const __m128i _in0 = in0;           \
  |  |   39|   666k|    const __m128i _in1 = in1;           \
  |  |   40|   666k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   666k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1788|   666k|  btf_16_adds_subs_sse2(x[3], x[7]);
  ------------------
  |  |   37|   666k|  do {                                  \
  |  |   38|   666k|    const __m128i _in0 = in0;           \
  |  |   39|   666k|    const __m128i _in1 = in1;           \
  |  |   40|   666k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   666k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1789|       |
 1790|       |  // stage 4
 1791|   666k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|   666k|  do {                                            \
  |  |   62|   666k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   666k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   666k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   666k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   666k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   666k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   666k|                                                  \
  |  |   69|   666k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   666k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   666k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   666k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   666k|                                                  \
  |  |   74|   666k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   666k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   666k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   666k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   666k|                                                  \
  |  |   79|   666k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   666k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1792|   666k|  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|   666k|  do {                                            \
  |  |   62|   666k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   666k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   666k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   666k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   666k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   666k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   666k|                                                  \
  |  |   69|   666k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   666k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   666k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   666k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   666k|                                                  \
  |  |   74|   666k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   666k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   666k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   666k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   666k|                                                  \
  |  |   79|   666k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   666k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1793|       |
 1794|       |  // stage 5
 1795|   666k|  btf_16_adds_subs_sse2(x[0], x[2]);
  ------------------
  |  |   37|   666k|  do {                                  \
  |  |   38|   666k|    const __m128i _in0 = in0;           \
  |  |   39|   666k|    const __m128i _in1 = in1;           \
  |  |   40|   666k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   666k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1796|   666k|  btf_16_adds_subs_sse2(x[1], x[3]);
  ------------------
  |  |   37|   666k|  do {                                  \
  |  |   38|   666k|    const __m128i _in0 = in0;           \
  |  |   39|   666k|    const __m128i _in1 = in1;           \
  |  |   40|   666k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   666k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1797|   666k|  btf_16_adds_subs_sse2(x[4], x[6]);
  ------------------
  |  |   37|   666k|  do {                                  \
  |  |   38|   666k|    const __m128i _in0 = in0;           \
  |  |   39|   666k|    const __m128i _in1 = in1;           \
  |  |   40|   666k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   666k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1798|   666k|  btf_16_adds_subs_sse2(x[5], x[7]);
  ------------------
  |  |   37|   666k|  do {                                  \
  |  |   38|   666k|    const __m128i _in0 = in0;           \
  |  |   39|   666k|    const __m128i _in1 = in1;           \
  |  |   40|   666k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   666k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1799|       |
 1800|       |  // stage 6
 1801|   666k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|   666k|  do {                                            \
  |  |   62|   666k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   666k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   666k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   666k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   666k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   666k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   666k|                                                  \
  |  |   69|   666k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   666k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   666k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   666k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   666k|                                                  \
  |  |   74|   666k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   666k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   666k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   666k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   666k|                                                  \
  |  |   79|   666k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   666k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1802|   666k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|   666k|  do {                                            \
  |  |   62|   666k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   666k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   666k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   666k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   666k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   666k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   666k|                                                  \
  |  |   69|   666k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   666k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   666k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   666k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   666k|                                                  \
  |  |   74|   666k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   666k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   666k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   666k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   666k|                                                  \
  |  |   79|   666k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   666k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   666k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1803|       |
 1804|       |  // stage 7
 1805|   666k|  output[0] = x[0];
 1806|   666k|  output[1] = _mm_subs_epi16(__zero, x[4]);
 1807|   666k|  output[2] = x[6];
 1808|   666k|  output[3] = _mm_subs_epi16(__zero, x[2]);
 1809|   666k|  output[4] = x[3];
 1810|   666k|  output[5] = _mm_subs_epi16(__zero, x[7]);
 1811|   666k|  output[6] = x[5];
 1812|   666k|  output[7] = _mm_subs_epi16(__zero, x[1]);
 1813|   666k|}
av1_lowbd_inv_txfm2d_add_idtx_ssse3:
 2386|   164k|                                         int stride, TX_SIZE tx_size) {
 2387|   164k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2388|   164k|  const int txw_idx = get_txw_idx(tx_size);
 2389|   164k|  const int txh_idx = get_txh_idx(tx_size);
 2390|   164k|  const int txfm_size_col = tx_size_wide[tx_size];
 2391|   164k|  const int txfm_size_row = tx_size_high[tx_size];
 2392|   164k|  const int col_max = AOMMIN(32, txfm_size_col);
  ------------------
  |  |   34|   164k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 164k]
  |  |  ------------------
  ------------------
 2393|   164k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|   164k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 164k]
  |  |  ------------------
  ------------------
 2394|   164k|  const int input_stride = row_max;
 2395|   164k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 2396|       |
 2397|   406k|  for (int i = 0; i < (col_max >> 3); ++i) {
  ------------------
  |  Branch (2397:19): [True: 242k, False: 164k]
  ------------------
 2398|   534k|    for (int j = 0; j < (row_max >> 3); j++) {
  ------------------
  |  Branch (2398:21): [True: 292k, False: 242k]
  ------------------
 2399|   292k|      __m128i buf[8];
 2400|   292k|      iidentity_row_8xn_ssse3(buf, input + j * 8 + i * 8 * input_stride,
 2401|   292k|                              row_max, shift[0], 8, txw_idx, rect_type);
 2402|   292k|      transpose_16bit_8x8(buf, buf);
 2403|   292k|      iidentity_col_8xn_ssse3(output + i * 8 + j * 8 * stride, stride, buf,
 2404|   292k|                              shift[1], 8, txh_idx);
 2405|   292k|    }
 2406|   242k|  }
 2407|   164k|}
av1_lowbd_inv_txfm2d_add_h_identity_ssse3:
 2544|  56.4k|                                               int eob) {
 2545|  56.4k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2546|  56.4k|  int eobx, eoby;
 2547|  56.4k|  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
 2548|  56.4k|  const int txw_idx = get_txw_idx(tx_size);
 2549|  56.4k|  const int txh_idx = get_txh_idx(tx_size);
 2550|  56.4k|  const int txfm_size_col = tx_size_wide[tx_size];
 2551|  56.4k|  const int txfm_size_row = tx_size_high[tx_size];
 2552|  56.4k|  const int buf_size_w_div8 = (eobx + 8) >> 3;
 2553|  56.4k|  const int buf_size_h_div8 = (eoby + 8) >> 3;
 2554|  56.4k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  56.4k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 56.4k]
  |  |  ------------------
  ------------------
 2555|  56.4k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 2556|       |
 2557|  56.4k|  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
 2558|  56.4k|  assert(fun_idx < 5);
 2559|  56.4k|  const transform_1d_ssse3 col_txfm =
 2560|  56.4k|      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
 2561|       |
 2562|  56.4k|  assert(col_txfm != NULL);
 2563|       |
 2564|  56.4k|  int ud_flip, lr_flip;
 2565|  56.4k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2566|   124k|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (2566:19): [True: 67.8k, False: 56.4k]
  ------------------
 2567|  67.8k|    __m128i buf0[64];
 2568|   138k|    for (int j = 0; j < buf_size_h_div8; j++) {
  ------------------
  |  Branch (2568:21): [True: 70.3k, False: 67.8k]
  ------------------
 2569|  70.3k|      __m128i *buf0_cur = buf0 + j * 8;
 2570|  70.3k|      const int32_t *input_cur = input + i * 8 * input_stride + j * 8;
 2571|  70.3k|      iidentity_row_8xn_ssse3(buf0_cur, input_cur, input_stride, shift[0], 8,
 2572|  70.3k|                              txw_idx, rect_type);
 2573|  70.3k|      transpose_16bit_8x8(buf0_cur, buf0_cur);
 2574|  70.3k|    }
 2575|  67.8k|    col_txfm(buf0, buf0);
 2576|  67.8k|    __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
 2577|  67.8k|    int k = ud_flip ? (txfm_size_row - 1) : 0;
  ------------------
  |  Branch (2577:13): [True: 11.8k, False: 56.0k]
  ------------------
 2578|  67.8k|    const int step = ud_flip ? -1 : 1;
  ------------------
  |  Branch (2578:22): [True: 11.8k, False: 56.0k]
  ------------------
 2579|  67.8k|    uint8_t *out = output + 8 * i;
 2580|   695k|    for (int j = 0; j < txfm_size_row; ++j, k += step) {
  ------------------
  |  Branch (2580:21): [True: 627k, False: 67.8k]
  ------------------
 2581|   627k|      const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
 2582|   627k|      __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
 2583|   627k|      const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
 2584|   627k|      _mm_storel_epi64((__m128i *)(out), u);
 2585|   627k|      out += stride;
 2586|   627k|    }
 2587|  67.8k|  }
 2588|  56.4k|}
av1_lowbd_inv_txfm2d_add_v_identity_ssse3:
 2593|   123k|                                               int eob) {
 2594|   123k|  __m128i buf1[64];
 2595|   123k|  int eobx, eoby;
 2596|   123k|  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
 2597|   123k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2598|   123k|  const int txw_idx = get_txw_idx(tx_size);
 2599|   123k|  const int txh_idx = get_txh_idx(tx_size);
 2600|   123k|  const int txfm_size_col = tx_size_wide[tx_size];
 2601|   123k|  const int txfm_size_row = tx_size_high[tx_size];
 2602|   123k|  const int buf_size_w_div8 = txfm_size_col >> 3;
 2603|   123k|  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
 2604|   123k|  const int buf_size_h_div8 = (eoby + 8) >> 3;
 2605|   123k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|   123k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 123k]
  |  |  ------------------
  ------------------
 2606|   123k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 2607|       |
 2608|   123k|  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
 2609|   123k|  const transform_1d_ssse3 row_txfm =
 2610|   123k|      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
 2611|       |
 2612|   123k|  assert(row_txfm != NULL);
 2613|   123k|  int ud_flip, lr_flip;
 2614|   123k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2615|   260k|  for (int i = 0; i < buf_size_h_div8; i++) {
  ------------------
  |  Branch (2615:19): [True: 136k, False: 123k]
  ------------------
 2616|   136k|    __m128i buf0[64];
 2617|   136k|    load_buffer_32bit_to_16bit(input + i * 8, input_stride, buf0,
 2618|   136k|                               buf_size_nonzero_w);
 2619|   136k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (2619:9): [True: 39.4k, False: 97.0k]
  |  Branch (2619:27): [True: 29.3k, False: 67.6k]
  ------------------
 2620|  68.7k|      round_shift_ssse3(buf0, buf0, buf_size_nonzero_w);  // rect special code
 2621|  68.7k|    }
 2622|   136k|    row_txfm(buf0, buf0);
 2623|   136k|    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
 2624|   136k|    __m128i *_buf1 = buf1;
 2625|   136k|    if (lr_flip) {
  ------------------
  |  Branch (2625:9): [True: 10.2k, False: 126k]
  ------------------
 2626|  24.8k|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2626:23): [True: 14.6k, False: 10.2k]
  ------------------
 2627|  14.6k|        __m128i temp[8];
 2628|  14.6k|        flip_buf_sse2(buf0 + 8 * j, temp, 8);
 2629|  14.6k|        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
 2630|  14.6k|      }
 2631|   126k|    } else {
 2632|   287k|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2632:23): [True: 161k, False: 126k]
  ------------------
 2633|   161k|        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
 2634|   161k|      }
 2635|   126k|    }
 2636|       |
 2637|   312k|    for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2637:21): [True: 175k, False: 136k]
  ------------------
 2638|   175k|      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
 2639|   175k|                              buf1 + j * 8, shift[1], 8, txh_idx);
 2640|   175k|    }
 2641|   136k|  }
 2642|   123k|}
av1_lowbd_inv_txfm2d_add_ssse3:
 2865|  3.48M|                                    TX_SIZE tx_size, int eob) {
 2866|  3.48M|  switch (tx_size) {
 2867|   588k|    case TX_4X4:
  ------------------
  |  Branch (2867:5): [True: 588k, False: 2.89M]
  ------------------
 2868|   588k|      lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
 2869|   588k|                                     eob);
 2870|   588k|      break;
 2871|   399k|    case TX_4X8:
  ------------------
  |  Branch (2871:5): [True: 399k, False: 3.08M]
  ------------------
 2872|   399k|      lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
 2873|   399k|                                     eob);
 2874|   399k|      break;
 2875|   543k|    case TX_8X4:
  ------------------
  |  Branch (2875:5): [True: 543k, False: 2.94M]
  ------------------
 2876|   543k|      lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
 2877|   543k|                                     eob);
 2878|   543k|      break;
 2879|   256k|    case TX_4X16:
  ------------------
  |  Branch (2879:5): [True: 256k, False: 3.23M]
  ------------------
 2880|   256k|      lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
 2881|   256k|                                      eob);
 2882|   256k|      break;
 2883|   515k|    case TX_16X4:
  ------------------
  |  Branch (2883:5): [True: 515k, False: 2.97M]
  ------------------
 2884|   515k|      lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
 2885|   515k|                                      eob);
 2886|   515k|      break;
 2887|  1.18M|    default:
  ------------------
  |  Branch (2887:5): [True: 1.18M, False: 2.30M]
  ------------------
 2888|  1.18M|      lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
 2889|  1.18M|                                          tx_size, eob);
 2890|  1.18M|      break;
 2891|  3.48M|  }
 2892|  3.48M|}
av1_inv_txfm_ssse3.c:iidentity_row_8xn_ssse3:
 2318|   362k|                                           int txw_idx, int rect_type) {
 2319|   362k|  const int32_t *input_row = input;
 2320|   362k|  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
 2321|   362k|  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
  ------------------
  |  |   41|   362k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2322|   362k|                                          (1 << (NewSqrt2Bits - shift - 1)));
  ------------------
  |  |   41|   362k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2323|   362k|  const __m128i one = _mm_set1_epi16(1);
 2324|   362k|  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
 2325|   362k|  if (rect_type != 1 && rect_type != -1) {
  ------------------
  |  Branch (2325:7): [True: 258k, False: 103k]
  |  Branch (2325:25): [True: 192k, False: 66.3k]
  ------------------
 2326|  1.73M|    for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (2326:21): [True: 1.53M, False: 192k]
  ------------------
 2327|  1.53M|      const __m128i src = load_32bit_to_16bit(input_row);
 2328|  1.53M|      input_row += stride;
 2329|  1.53M|      __m128i lo = _mm_unpacklo_epi16(src, one);
 2330|  1.53M|      __m128i hi = _mm_unpackhi_epi16(src, one);
 2331|  1.53M|      lo = _mm_madd_epi16(lo, scale_rounding);
 2332|  1.53M|      hi = _mm_madd_epi16(hi, scale_rounding);
 2333|  1.53M|      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  1.53M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2334|  1.53M|      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  1.53M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2335|  1.53M|      out[i] = _mm_packs_epi32(lo, hi);
 2336|  1.53M|    }
 2337|   192k|  } else {
 2338|   170k|    const __m128i rect_scale =
 2339|   170k|        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
  ------------------
  |  |   41|   170k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2340|  1.53M|    for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (2340:21): [True: 1.36M, False: 170k]
  ------------------
 2341|  1.36M|      __m128i src = load_32bit_to_16bit(input_row);
 2342|  1.36M|      src = _mm_mulhrs_epi16(src, rect_scale);
 2343|  1.36M|      input_row += stride;
 2344|  1.36M|      __m128i lo = _mm_unpacklo_epi16(src, one);
 2345|  1.36M|      __m128i hi = _mm_unpackhi_epi16(src, one);
 2346|  1.36M|      lo = _mm_madd_epi16(lo, scale_rounding);
 2347|  1.36M|      hi = _mm_madd_epi16(hi, scale_rounding);
 2348|  1.36M|      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  1.36M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2349|  1.36M|      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
  ------------------
  |  |   41|  1.36M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2350|  1.36M|      out[i] = _mm_packs_epi32(lo, hi);
 2351|  1.36M|    }
 2352|   170k|  }
 2353|   362k|}
av1_inv_txfm_ssse3.c:iidentity_col_8xn_ssse3:
 2357|   468k|                                           int txh_idx) {
 2358|   468k|  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
 2359|   468k|  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
  ------------------
  |  |   41|   468k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2360|   468k|  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
 2361|   468k|  const __m128i one = _mm_set1_epi16(1);
 2362|   468k|  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
 2363|   468k|  const __m128i zero = _mm_setzero_si128();
 2364|  4.21M|  for (int h = 0; h < height; ++h) {
  ------------------
  |  Branch (2364:19): [True: 3.74M, False: 468k]
  ------------------
 2365|  3.74M|    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
 2366|  3.74M|    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
 2367|  3.74M|    lo = _mm_madd_epi16(lo, scale_coeff);
 2368|  3.74M|    hi = _mm_madd_epi16(hi, scale_coeff);
 2369|  3.74M|    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
  ------------------
  |  |   41|  3.74M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2370|  3.74M|    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
  ------------------
  |  |   41|  3.74M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2371|  3.74M|    lo = _mm_add_epi32(lo, shift_rounding);
 2372|  3.74M|    hi = _mm_add_epi32(hi, shift_rounding);
 2373|  3.74M|    lo = _mm_srai_epi32(lo, -shift);
 2374|  3.74M|    hi = _mm_srai_epi32(hi, -shift);
 2375|  3.74M|    __m128i x = _mm_packs_epi32(lo, hi);
 2376|       |
 2377|  3.74M|    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
 2378|  3.74M|    x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
 2379|  3.74M|    const __m128i u = _mm_packus_epi16(x, x);
 2380|  3.74M|    _mm_storel_epi64((__m128i *)(output), u);
 2381|  3.74M|    output += stride;
 2382|  3.74M|  }
 2383|   468k|}
av1_inv_txfm_ssse3.c:idct4_sse2:
   27|  1.34M|static void idct4_sse2(const __m128i *input, __m128i *output) {
   28|  1.34M|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  1.34M|#define INV_COS_BIT 12
  ------------------
   29|  1.34M|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  1.34M|#define INV_COS_BIT 12
  ------------------
   30|  1.34M|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  1.34M|#define INV_COS_BIT 12
  ------------------
   31|       |
   32|  1.34M|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  1.34M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   33|  1.34M|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  1.34M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   34|  1.34M|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  1.34M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   35|  1.34M|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  1.34M|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   36|       |
   37|       |  // stage 1
   38|  1.34M|  __m128i x[4];
   39|  1.34M|  x[0] = input[0];
   40|  1.34M|  x[1] = input[2];
   41|  1.34M|  x[2] = input[1];
   42|  1.34M|  x[3] = input[3];
   43|       |
   44|       |  // stage 2
   45|  1.34M|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|  1.34M|  do {                                            \
  |  |   62|  1.34M|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  1.34M|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  1.34M|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  1.34M|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  1.34M|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  1.34M|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  1.34M|                                                  \
  |  |   69|  1.34M|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  1.34M|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  1.34M|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  1.34M|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  1.34M|                                                  \
  |  |   74|  1.34M|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  1.34M|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  1.34M|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  1.34M|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  1.34M|                                                  \
  |  |   79|  1.34M|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  1.34M|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  1.34M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   46|  1.34M|  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  1.34M|  do {                                            \
  |  |   62|  1.34M|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  1.34M|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  1.34M|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  1.34M|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  1.34M|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  1.34M|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  1.34M|                                                  \
  |  |   69|  1.34M|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  1.34M|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  1.34M|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  1.34M|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  1.34M|                                                  \
  |  |   74|  1.34M|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  1.34M|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  1.34M|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  1.34M|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  1.34M|                                                  \
  |  |   79|  1.34M|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  1.34M|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  1.34M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   47|       |
   48|       |  // stage 3
   49|  1.34M|  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  ------------------
  |  |   53|  1.34M|  do {                                                  \
  |  |   54|  1.34M|    const __m128i _in0 = in0;                           \
  |  |   55|  1.34M|    const __m128i _in1 = in1;                           \
  |  |   56|  1.34M|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  1.34M|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  1.34M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   50|  1.34M|  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
  ------------------
  |  |   53|  1.34M|  do {                                                  \
  |  |   54|  1.34M|    const __m128i _in0 = in0;                           \
  |  |   55|  1.34M|    const __m128i _in1 = in1;                           \
  |  |   56|  1.34M|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|  1.34M|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|  1.34M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   51|  1.34M|}
av1_inv_txfm_ssse3.c:iadst4_sse2:
 1597|   705k|static void iadst4_sse2(const __m128i *input, __m128i *output) {
 1598|   705k|  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   705k|#define INV_COS_BIT 12
  ------------------
 1599|   705k|  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  ------------------
  |  |   20|   705k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1600|   705k|  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  ------------------
  |  |   20|   705k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1601|   705k|  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  ------------------
  |  |   20|   705k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1602|   705k|  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  ------------------
  |  |   20|   705k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1603|   705k|  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  ------------------
  |  |   20|   705k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1604|   705k|  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  ------------------
  |  |   20|   705k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1605|   705k|  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  ------------------
  |  |   20|   705k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1606|   705k|  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  ------------------
  |  |   20|   705k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1607|   705k|  __m128i x0[4];
 1608|   705k|  x0[0] = input[0];
 1609|   705k|  x0[1] = input[1];
 1610|   705k|  x0[2] = input[2];
 1611|   705k|  x0[3] = input[3];
 1612|       |
 1613|   705k|  __m128i u[4];
 1614|   705k|  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
 1615|   705k|  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
 1616|   705k|  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
 1617|   705k|  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
 1618|       |
 1619|   705k|  __m128i x1[16];
 1620|   705k|  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
 1621|   705k|  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
 1622|   705k|  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
 1623|   705k|  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
 1624|   705k|  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
 1625|   705k|  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
 1626|   705k|  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
 1627|   705k|  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
 1628|   705k|  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
 1629|   705k|  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
 1630|   705k|  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x2*sin3
 1631|   705k|  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
 1632|   705k|  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
 1633|   705k|  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
 1634|   705k|  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
 1635|   705k|  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
 1636|       |
 1637|   705k|  __m128i x2[8];
 1638|   705k|  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
 1639|   705k|  x2[1] = _mm_add_epi32(x1[1], x1[5]);
 1640|   705k|  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
 1641|   705k|  x2[3] = _mm_add_epi32(x1[3], x1[7]);
 1642|   705k|  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
 1643|   705k|  x2[5] = _mm_add_epi32(x1[9], x1[11]);
 1644|   705k|  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1
 1645|   705k|  x2[7] = _mm_add_epi32(x1[13], x1[15]);
 1646|       |
 1647|   705k|  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   705k|#define INV_COS_BIT 12
  ------------------
 1648|  3.52M|  for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (1648:19): [True: 2.82M, False: 705k]
  ------------------
 1649|  2.82M|    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
 1650|  2.82M|    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
 1651|  2.82M|    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
  ------------------
  |  |   43|  2.82M|#define INV_COS_BIT 12
  ------------------
 1652|  2.82M|    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
  ------------------
  |  |   43|  2.82M|#define INV_COS_BIT 12
  ------------------
 1653|  2.82M|    output[i] = _mm_packs_epi32(out0, out1);
 1654|  2.82M|  }
 1655|   705k|}
av1_inv_txfm_ssse3.c:iidentity4_ssse3:
 2210|   579k|static void iidentity4_ssse3(const __m128i *input, __m128i *output) {
 2211|   579k|  const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
  ------------------
  |  |   41|   579k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2212|   579k|  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
  ------------------
  |  |   41|   579k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2213|  2.89M|  for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (2213:19): [True: 2.31M, False: 579k]
  ------------------
 2214|  2.31M|    __m128i x = _mm_mulhrs_epi16(input[i], scale);
 2215|  2.31M|    output[i] = _mm_adds_epi16(x, input[i]);
 2216|  2.31M|  }
 2217|   579k|}
av1_inv_txfm_ssse3.c:iidentity8_sse2:
 2219|   167k|static void iidentity8_sse2(const __m128i *input, __m128i *output) {
 2220|  1.50M|  for (int i = 0; i < 8; ++i) {
  ------------------
  |  Branch (2220:19): [True: 1.34M, False: 167k]
  ------------------
 2221|  1.34M|    output[i] = _mm_adds_epi16(input[i], input[i]);
 2222|  1.34M|  }
 2223|   167k|}
av1_inv_txfm_ssse3.c:idct16_low1_ssse3:
  236|   146k|static void idct16_low1_ssse3(const __m128i *input, __m128i *output) {
  237|   146k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   146k|#define INV_COS_BIT 12
  ------------------
  238|       |
  239|       |  // stage 1
  240|   146k|  __m128i x[2];
  241|   146k|  x[0] = input[0];
  242|       |
  243|       |  // stage 2
  244|       |  // stage 3
  245|       |  // stage 4
  246|   146k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|   146k|  do {                                          \
  |  |   29|   146k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   146k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   146k|    const __m128i _in = in;                     \
  |  |   32|   146k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   146k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   146k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  247|       |
  248|       |  // stage 5
  249|       |  // stage 6
  250|       |  // stage 7
  251|   146k|  output[0] = x[0];
  252|   146k|  output[15] = x[0];
  253|   146k|  output[1] = x[1];
  254|   146k|  output[14] = x[1];
  255|   146k|  output[2] = x[1];
  256|   146k|  output[13] = x[1];
  257|   146k|  output[3] = x[0];
  258|   146k|  output[12] = x[0];
  259|   146k|  output[4] = x[0];
  260|   146k|  output[11] = x[0];
  261|   146k|  output[5] = x[1];
  262|   146k|  output[10] = x[1];
  263|   146k|  output[6] = x[1];
  264|   146k|  output[9] = x[1];
  265|   146k|  output[7] = x[0];
  266|   146k|  output[8] = x[0];
  267|   146k|}
av1_inv_txfm_ssse3.c:idct16_low8_ssse3:
  269|   177k|static void idct16_low8_ssse3(const __m128i *input, __m128i *output) {
  270|   177k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   177k|#define INV_COS_BIT 12
  ------------------
  271|   177k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   177k|#define INV_COS_BIT 12
  ------------------
  272|   177k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   177k|#define INV_COS_BIT 12
  ------------------
  273|   177k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|   177k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  274|   177k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|   177k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  275|   177k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|   177k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  276|       |
  277|       |  // stage 1
  278|   177k|  __m128i x[16];
  279|   177k|  x[0] = input[0];
  280|   177k|  x[2] = input[4];
  281|   177k|  x[4] = input[2];
  282|   177k|  x[6] = input[6];
  283|   177k|  x[8] = input[1];
  284|   177k|  x[10] = input[5];
  285|   177k|  x[12] = input[3];
  286|   177k|  x[14] = input[7];
  287|       |
  288|       |  // stage 2
  289|   177k|  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   28|   177k|  do {                                          \
  |  |   29|   177k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   177k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   177k|    const __m128i _in = in;                     \
  |  |   32|   177k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   177k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  290|   177k|  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  ------------------
  |  |   28|   177k|  do {                                          \
  |  |   29|   177k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   177k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   177k|    const __m128i _in = in;                     \
  |  |   32|   177k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   177k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  291|   177k|  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  ------------------
  |  |   28|   177k|  do {                                          \
  |  |   29|   177k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   177k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   177k|    const __m128i _in = in;                     \
  |  |   32|   177k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   177k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  292|   177k|  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   28|   177k|  do {                                          \
  |  |   29|   177k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   177k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   177k|    const __m128i _in = in;                     \
  |  |   32|   177k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   177k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  293|       |
  294|       |  // stage 3
  295|   177k|  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   28|   177k|  do {                                          \
  |  |   29|   177k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   177k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   177k|    const __m128i _in = in;                     \
  |  |   32|   177k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   177k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  296|   177k|  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  ------------------
  |  |   28|   177k|  do {                                          \
  |  |   29|   177k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   177k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   177k|    const __m128i _in = in;                     \
  |  |   32|   177k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   177k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  297|   177k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|   177k|  do {                                  \
  |  |   38|   177k|    const __m128i _in0 = in0;           \
  |  |   39|   177k|    const __m128i _in1 = in1;           \
  |  |   40|   177k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   177k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  298|   177k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|   177k|  do {                                  \
  |  |   46|   177k|    const __m128i _in0 = in0;           \
  |  |   47|   177k|    const __m128i _in1 = in1;           \
  |  |   48|   177k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   177k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  299|   177k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|   177k|  do {                                  \
  |  |   38|   177k|    const __m128i _in0 = in0;           \
  |  |   39|   177k|    const __m128i _in1 = in1;           \
  |  |   40|   177k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   177k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  300|   177k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|   177k|  do {                                  \
  |  |   46|   177k|    const __m128i _in0 = in0;           \
  |  |   47|   177k|    const __m128i _in1 = in1;           \
  |  |   48|   177k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   177k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  301|       |
  302|       |  // stage 4
  303|   177k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|   177k|  do {                                          \
  |  |   29|   177k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   177k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   177k|    const __m128i _in = in;                     \
  |  |   32|   177k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   177k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  304|   177k|  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  ------------------
  |  |   28|   177k|  do {                                          \
  |  |   29|   177k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   177k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   177k|    const __m128i _in = in;                     \
  |  |   32|   177k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   177k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  305|   177k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|   177k|  do {                                  \
  |  |   38|   177k|    const __m128i _in0 = in0;           \
  |  |   39|   177k|    const __m128i _in1 = in1;           \
  |  |   40|   177k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   177k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  306|   177k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|   177k|  do {                                  \
  |  |   46|   177k|    const __m128i _in0 = in0;           \
  |  |   47|   177k|    const __m128i _in1 = in1;           \
  |  |   48|   177k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   177k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  307|   177k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|   177k|  do {                                            \
  |  |   62|   177k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   177k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   177k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   177k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   177k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   177k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   177k|                                                  \
  |  |   69|   177k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   177k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   177k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   177k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   177k|                                                  \
  |  |   74|   177k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   177k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   177k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   177k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   177k|                                                  \
  |  |   79|   177k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   177k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  308|   177k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|   177k|  do {                                            \
  |  |   62|   177k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   177k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   177k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   177k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   177k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   177k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   177k|                                                  \
  |  |   69|   177k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   177k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   177k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   177k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   177k|                                                  \
  |  |   74|   177k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   177k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   177k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   177k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   177k|                                                  \
  |  |   79|   177k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   177k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   177k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  309|       |
  310|   177k|  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  311|   177k|  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  312|   177k|  idct16_stage7_sse2(output, x);
  313|   177k|}
av1_inv_txfm_ssse3.c:idct16_stage5_sse2:
  200|   230k|                                      int8_t cos_bit) {
  201|   230k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|   230k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  202|   230k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   230k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  203|   230k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|   230k|  do {                                  \
  |  |   38|   230k|    const __m128i _in0 = in0;           \
  |  |   39|   230k|    const __m128i _in1 = in1;           \
  |  |   40|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  204|   230k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|   230k|  do {                                  \
  |  |   38|   230k|    const __m128i _in0 = in0;           \
  |  |   39|   230k|    const __m128i _in1 = in1;           \
  |  |   40|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  205|   230k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|   230k|  do {                                            \
  |  |   62|   230k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   230k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   230k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   230k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   230k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   230k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   230k|                                                  \
  |  |   69|   230k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   230k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   230k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   230k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   230k|                                                  \
  |  |   74|   230k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   230k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   230k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   230k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   230k|                                                  \
  |  |   79|   230k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   230k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  206|   230k|  btf_16_adds_subs_sse2(x[8], x[11]);
  ------------------
  |  |   37|   230k|  do {                                  \
  |  |   38|   230k|    const __m128i _in0 = in0;           \
  |  |   39|   230k|    const __m128i _in1 = in1;           \
  |  |   40|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  207|   230k|  btf_16_adds_subs_sse2(x[9], x[10]);
  ------------------
  |  |   37|   230k|  do {                                  \
  |  |   38|   230k|    const __m128i _in0 = in0;           \
  |  |   39|   230k|    const __m128i _in1 = in1;           \
  |  |   40|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  208|   230k|  btf_16_subs_adds_sse2(x[15], x[12]);
  ------------------
  |  |   45|   230k|  do {                                  \
  |  |   46|   230k|    const __m128i _in0 = in0;           \
  |  |   47|   230k|    const __m128i _in1 = in1;           \
  |  |   48|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  209|   230k|  btf_16_subs_adds_sse2(x[14], x[13]);
  ------------------
  |  |   45|   230k|  do {                                  \
  |  |   46|   230k|    const __m128i _in0 = in0;           \
  |  |   47|   230k|    const __m128i _in1 = in1;           \
  |  |   48|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  210|   230k|}
av1_inv_txfm_ssse3.c:idct16_stage6_sse2:
  214|   230k|                                      int8_t cos_bit) {
  215|   230k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|   230k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  216|   230k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   230k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  217|   230k|  btf_16_adds_subs_sse2(x[0], x[7]);
  ------------------
  |  |   37|   230k|  do {                                  \
  |  |   38|   230k|    const __m128i _in0 = in0;           \
  |  |   39|   230k|    const __m128i _in1 = in1;           \
  |  |   40|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  218|   230k|  btf_16_adds_subs_sse2(x[1], x[6]);
  ------------------
  |  |   37|   230k|  do {                                  \
  |  |   38|   230k|    const __m128i _in0 = in0;           \
  |  |   39|   230k|    const __m128i _in1 = in1;           \
  |  |   40|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  219|   230k|  btf_16_adds_subs_sse2(x[2], x[5]);
  ------------------
  |  |   37|   230k|  do {                                  \
  |  |   38|   230k|    const __m128i _in0 = in0;           \
  |  |   39|   230k|    const __m128i _in1 = in1;           \
  |  |   40|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  220|   230k|  btf_16_adds_subs_sse2(x[3], x[4]);
  ------------------
  |  |   37|   230k|  do {                                  \
  |  |   38|   230k|    const __m128i _in0 = in0;           \
  |  |   39|   230k|    const __m128i _in1 = in1;           \
  |  |   40|   230k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   230k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  221|   230k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|   230k|  do {                                            \
  |  |   62|   230k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   230k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   230k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   230k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   230k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   230k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   230k|                                                  \
  |  |   69|   230k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   230k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   230k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   230k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   230k|                                                  \
  |  |   74|   230k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   230k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   230k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   230k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   230k|                                                  \
  |  |   79|   230k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   230k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  222|   230k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   61|   230k|  do {                                            \
  |  |   62|   230k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   230k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   230k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   230k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   230k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   230k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   230k|                                                  \
  |  |   69|   230k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   230k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   230k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   230k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   230k|                                                  \
  |  |   74|   230k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   230k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   230k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   230k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   230k|                                                  \
  |  |   79|   230k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   230k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   230k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  223|   230k|}
av1_inv_txfm_ssse3.c:idct16_stage7_sse2:
  225|   633k|static inline void idct16_stage7_sse2(__m128i *output, __m128i *x) {
  226|   633k|  btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
  ------------------
  |  |   53|   633k|  do {                                                  \
  |  |   54|   633k|    const __m128i _in0 = in0;                           \
  |  |   55|   633k|    const __m128i _in1 = in1;                           \
  |  |   56|   633k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   633k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   633k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  227|   633k|  btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
  ------------------
  |  |   53|   633k|  do {                                                  \
  |  |   54|   633k|    const __m128i _in0 = in0;                           \
  |  |   55|   633k|    const __m128i _in1 = in1;                           \
  |  |   56|   633k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   633k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   633k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  228|   633k|  btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
  ------------------
  |  |   53|   633k|  do {                                                  \
  |  |   54|   633k|    const __m128i _in0 = in0;                           \
  |  |   55|   633k|    const __m128i _in1 = in1;                           \
  |  |   56|   633k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   633k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   633k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  229|   633k|  btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
  ------------------
  |  |   53|   633k|  do {                                                  \
  |  |   54|   633k|    const __m128i _in0 = in0;                           \
  |  |   55|   633k|    const __m128i _in1 = in1;                           \
  |  |   56|   633k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   633k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   633k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  230|   633k|  btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
  ------------------
  |  |   53|   633k|  do {                                                  \
  |  |   54|   633k|    const __m128i _in0 = in0;                           \
  |  |   55|   633k|    const __m128i _in1 = in1;                           \
  |  |   56|   633k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   633k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   633k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  231|   633k|  btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
  ------------------
  |  |   53|   633k|  do {                                                  \
  |  |   54|   633k|    const __m128i _in0 = in0;                           \
  |  |   55|   633k|    const __m128i _in1 = in1;                           \
  |  |   56|   633k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   633k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   633k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  232|   633k|  btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
  ------------------
  |  |   53|   633k|  do {                                                  \
  |  |   54|   633k|    const __m128i _in0 = in0;                           \
  |  |   55|   633k|    const __m128i _in1 = in1;                           \
  |  |   56|   633k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   633k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   633k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  233|   633k|  btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
  ------------------
  |  |   53|   633k|  do {                                                  \
  |  |   54|   633k|    const __m128i _in0 = in0;                           \
  |  |   55|   633k|    const __m128i _in1 = in1;                           \
  |  |   56|   633k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   633k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   633k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  234|   633k|}
av1_inv_txfm_ssse3.c:idct16_sse2:
  315|  53.8k|static void idct16_sse2(const __m128i *input, __m128i *output) {
  316|  53.8k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  53.8k|#define INV_COS_BIT 12
  ------------------
  317|  53.8k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  53.8k|#define INV_COS_BIT 12
  ------------------
  318|  53.8k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  53.8k|#define INV_COS_BIT 12
  ------------------
  319|       |
  320|  53.8k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  321|  53.8k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  322|  53.8k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  323|  53.8k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  324|  53.8k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  325|  53.8k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  326|  53.8k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  327|  53.8k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  328|  53.8k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  329|  53.8k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  330|  53.8k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  331|  53.8k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  332|  53.8k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  333|  53.8k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  334|  53.8k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  335|  53.8k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  336|  53.8k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  337|  53.8k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  338|  53.8k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|  53.8k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  339|       |
  340|       |  // stage 1
  341|  53.8k|  __m128i x[16];
  342|  53.8k|  x[0] = input[0];
  343|  53.8k|  x[1] = input[8];
  344|  53.8k|  x[2] = input[4];
  345|  53.8k|  x[3] = input[12];
  346|  53.8k|  x[4] = input[2];
  347|  53.8k|  x[5] = input[10];
  348|  53.8k|  x[6] = input[6];
  349|  53.8k|  x[7] = input[14];
  350|  53.8k|  x[8] = input[1];
  351|  53.8k|  x[9] = input[9];
  352|  53.8k|  x[10] = input[5];
  353|  53.8k|  x[11] = input[13];
  354|  53.8k|  x[12] = input[3];
  355|  53.8k|  x[13] = input[11];
  356|  53.8k|  x[14] = input[7];
  357|  53.8k|  x[15] = input[15];
  358|       |
  359|       |  // stage 2
  360|  53.8k|  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  361|  53.8k|  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  362|  53.8k|  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  363|  53.8k|  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  364|       |
  365|       |  // stage 3
  366|  53.8k|  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  367|  53.8k|  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  368|  53.8k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|  53.8k|  do {                                  \
  |  |   38|  53.8k|    const __m128i _in0 = in0;           \
  |  |   39|  53.8k|    const __m128i _in1 = in1;           \
  |  |   40|  53.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  53.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  369|  53.8k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|  53.8k|  do {                                  \
  |  |   46|  53.8k|    const __m128i _in0 = in0;           \
  |  |   47|  53.8k|    const __m128i _in1 = in1;           \
  |  |   48|  53.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  53.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  370|  53.8k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|  53.8k|  do {                                  \
  |  |   38|  53.8k|    const __m128i _in0 = in0;           \
  |  |   39|  53.8k|    const __m128i _in1 = in1;           \
  |  |   40|  53.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  53.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  371|  53.8k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|  53.8k|  do {                                  \
  |  |   46|  53.8k|    const __m128i _in0 = in0;           \
  |  |   47|  53.8k|    const __m128i _in1 = in1;           \
  |  |   48|  53.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  53.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  372|       |
  373|       |  // stage 4
  374|  53.8k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  375|  53.8k|  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  376|  53.8k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  53.8k|  do {                                  \
  |  |   38|  53.8k|    const __m128i _in0 = in0;           \
  |  |   39|  53.8k|    const __m128i _in1 = in1;           \
  |  |   40|  53.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  53.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  377|  53.8k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|  53.8k|  do {                                  \
  |  |   46|  53.8k|    const __m128i _in0 = in0;           \
  |  |   47|  53.8k|    const __m128i _in1 = in1;           \
  |  |   48|  53.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  53.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  378|  53.8k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  379|  53.8k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  53.8k|  do {                                            \
  |  |   62|  53.8k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  53.8k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  53.8k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  53.8k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  53.8k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  53.8k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  53.8k|                                                  \
  |  |   69|  53.8k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  53.8k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  53.8k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  53.8k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  53.8k|                                                  \
  |  |   74|  53.8k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  53.8k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  53.8k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  53.8k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  53.8k|                                                  \
  |  |   79|  53.8k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  53.8k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  53.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  380|       |
  381|       |  // stage 5~7
  382|  53.8k|  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  383|  53.8k|  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  384|  53.8k|  idct16_stage7_sse2(output, x);
  385|  53.8k|}
av1_inv_txfm_ssse3.c:iadst16_low1_ssse3:
 1974|  57.4k|static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) {
 1975|  57.4k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  57.4k|#define INV_COS_BIT 12
  ------------------
 1976|  57.4k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  57.4k|#define INV_COS_BIT 12
  ------------------
 1977|  57.4k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  57.4k|#define INV_COS_BIT 12
  ------------------
 1978|       |
 1979|  57.4k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  57.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1980|  57.4k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  57.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1981|  57.4k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  57.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1982|  57.4k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  57.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1983|       |
 1984|       |  // stage 1
 1985|  57.4k|  __m128i x[16];
 1986|  57.4k|  x[1] = input[0];
 1987|       |
 1988|       |  // stage 2
 1989|  57.4k|  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
  ------------------
  |  |   28|  57.4k|  do {                                          \
  |  |   29|  57.4k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  57.4k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  57.4k|    const __m128i _in = in;                     \
  |  |   32|  57.4k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  57.4k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  57.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1990|       |
 1991|       |  // stage 3
 1992|  57.4k|  x[8] = x[0];
 1993|  57.4k|  x[9] = x[1];
 1994|       |
 1995|       |  // stage 4
 1996|  57.4k|  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   61|  57.4k|  do {                                            \
  |  |   62|  57.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  57.4k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  57.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  57.4k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  57.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  57.4k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  57.4k|                                                  \
  |  |   69|  57.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  57.4k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  57.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  57.4k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  57.4k|                                                  \
  |  |   74|  57.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  57.4k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  57.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  57.4k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  57.4k|                                                  \
  |  |   79|  57.4k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  57.4k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  57.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1997|       |
 1998|       |  // stage 5
 1999|  57.4k|  x[4] = x[0];
 2000|  57.4k|  x[5] = x[1];
 2001|  57.4k|  x[12] = x[8];
 2002|  57.4k|  x[13] = x[9];
 2003|       |
 2004|       |  // stage 6
 2005|  57.4k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|  57.4k|  do {                                            \
  |  |   62|  57.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  57.4k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  57.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  57.4k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  57.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  57.4k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  57.4k|                                                  \
  |  |   69|  57.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  57.4k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  57.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  57.4k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  57.4k|                                                  \
  |  |   74|  57.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  57.4k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  57.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  57.4k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  57.4k|                                                  \
  |  |   79|  57.4k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  57.4k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  57.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2006|  57.4k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   61|  57.4k|  do {                                            \
  |  |   62|  57.4k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  57.4k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  57.4k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  57.4k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  57.4k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  57.4k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  57.4k|                                                  \
  |  |   69|  57.4k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  57.4k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  57.4k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  57.4k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  57.4k|                                                  \
  |  |   74|  57.4k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  57.4k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  57.4k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  57.4k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  57.4k|                                                  \
  |  |   79|  57.4k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  57.4k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  57.4k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2007|       |
 2008|       |  // stage 7
 2009|  57.4k|  x[2] = x[0];
 2010|  57.4k|  x[3] = x[1];
 2011|  57.4k|  x[6] = x[4];
 2012|  57.4k|  x[7] = x[5];
 2013|  57.4k|  x[10] = x[8];
 2014|  57.4k|  x[11] = x[9];
 2015|  57.4k|  x[14] = x[12];
 2016|  57.4k|  x[15] = x[13];
 2017|       |
 2018|  57.4k|  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
 2019|  57.4k|  iadst16_stage9_ssse3(output, x);
 2020|  57.4k|}
av1_inv_txfm_ssse3.c:iadst16_stage8_ssse3:
 1945|   217k|                                        int8_t cos_bit) {
 1946|   217k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   217k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1947|   217k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   217k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1948|   217k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|   217k|  do {                                            \
  |  |   62|   217k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   217k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   217k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   217k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   217k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   217k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   217k|                                                  \
  |  |   69|   217k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   217k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   217k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   217k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   217k|                                                  \
  |  |   74|   217k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   217k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   217k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   217k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   217k|                                                  \
  |  |   79|   217k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   217k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   217k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1949|   217k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|   217k|  do {                                            \
  |  |   62|   217k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   217k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   217k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   217k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   217k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   217k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   217k|                                                  \
  |  |   69|   217k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   217k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   217k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   217k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   217k|                                                  \
  |  |   74|   217k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   217k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   217k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   217k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   217k|                                                  \
  |  |   79|   217k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   217k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   217k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1950|   217k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   61|   217k|  do {                                            \
  |  |   62|   217k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   217k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   217k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   217k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   217k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   217k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   217k|                                                  \
  |  |   69|   217k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   217k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   217k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   217k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   217k|                                                  \
  |  |   74|   217k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   217k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   217k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   217k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   217k|                                                  \
  |  |   79|   217k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   217k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   217k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1951|   217k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   61|   217k|  do {                                            \
  |  |   62|   217k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   217k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   217k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   217k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   217k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   217k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   217k|                                                  \
  |  |   69|   217k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   217k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   217k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   217k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   217k|                                                  \
  |  |   74|   217k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   217k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   217k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   217k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   217k|                                                  \
  |  |   79|   217k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   217k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   217k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1952|   217k|}
av1_inv_txfm_ssse3.c:iadst16_stage9_ssse3:
 1954|   459k|static inline void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
 1955|   459k|  const __m128i __zero = _mm_setzero_si128();
 1956|   459k|  output[0] = x[0];
 1957|   459k|  output[1] = _mm_subs_epi16(__zero, x[8]);
 1958|   459k|  output[2] = x[12];
 1959|   459k|  output[3] = _mm_subs_epi16(__zero, x[4]);
 1960|   459k|  output[4] = x[6];
 1961|   459k|  output[5] = _mm_subs_epi16(__zero, x[14]);
 1962|   459k|  output[6] = x[10];
 1963|   459k|  output[7] = _mm_subs_epi16(__zero, x[2]);
 1964|   459k|  output[8] = x[3];
 1965|   459k|  output[9] = _mm_subs_epi16(__zero, x[11]);
 1966|   459k|  output[10] = x[15];
 1967|   459k|  output[11] = _mm_subs_epi16(__zero, x[7]);
 1968|   459k|  output[12] = x[5];
 1969|   459k|  output[13] = _mm_subs_epi16(__zero, x[13]);
 1970|   459k|  output[14] = x[9];
 1971|   459k|  output[15] = _mm_subs_epi16(__zero, x[1]);
 1972|   459k|}
av1_inv_txfm_ssse3.c:iadst16_low8_ssse3:
 2022|   122k|static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) {
 2023|   122k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   122k|#define INV_COS_BIT 12
  ------------------
 2024|   122k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   122k|#define INV_COS_BIT 12
  ------------------
 2025|   122k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   122k|#define INV_COS_BIT 12
  ------------------
 2026|       |
 2027|       |  // stage 1
 2028|   122k|  __m128i x[16];
 2029|   122k|  x[1] = input[0];
 2030|   122k|  x[3] = input[2];
 2031|   122k|  x[5] = input[4];
 2032|   122k|  x[7] = input[6];
 2033|   122k|  x[8] = input[7];
 2034|   122k|  x[10] = input[5];
 2035|   122k|  x[12] = input[3];
 2036|   122k|  x[14] = input[1];
 2037|       |
 2038|       |  // stage 2
 2039|   122k|  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
  ------------------
  |  |   28|   122k|  do {                                          \
  |  |   29|   122k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   122k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   122k|    const __m128i _in = in;                     \
  |  |   32|   122k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   122k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   122k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2040|   122k|  btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
  ------------------
  |  |   28|   122k|  do {                                          \
  |  |   29|   122k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   122k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   122k|    const __m128i _in = in;                     \
  |  |   32|   122k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   122k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   122k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2041|   122k|  btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
  ------------------
  |  |   28|   122k|  do {                                          \
  |  |   29|   122k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   122k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   122k|    const __m128i _in = in;                     \
  |  |   32|   122k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   122k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   122k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2042|   122k|  btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
  ------------------
  |  |   28|   122k|  do {                                          \
  |  |   29|   122k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   122k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   122k|    const __m128i _in = in;                     \
  |  |   32|   122k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   122k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   122k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2043|   122k|  btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
  ------------------
  |  |   28|   122k|  do {                                          \
  |  |   29|   122k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   122k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   122k|    const __m128i _in = in;                     \
  |  |   32|   122k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   122k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   122k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2044|   122k|  btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
  ------------------
  |  |   28|   122k|  do {                                          \
  |  |   29|   122k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   122k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   122k|    const __m128i _in = in;                     \
  |  |   32|   122k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   122k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   122k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2045|   122k|  btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
  ------------------
  |  |   28|   122k|  do {                                          \
  |  |   29|   122k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   122k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   122k|    const __m128i _in = in;                     \
  |  |   32|   122k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   122k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   122k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2046|   122k|  btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
  ------------------
  |  |   28|   122k|  do {                                          \
  |  |   29|   122k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   122k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   122k|    const __m128i _in = in;                     \
  |  |   32|   122k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   122k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   122k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2047|       |
 2048|       |  // stage 3
 2049|   122k|  iadst16_stage3_ssse3(x);
 2050|   122k|  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
 2051|   122k|  iadst16_stage5_ssse3(x);
 2052|   122k|  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
 2053|   122k|  iadst16_stage7_ssse3(x);
 2054|   122k|  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
 2055|   122k|  iadst16_stage9_ssse3(output, x);
 2056|   122k|}
av1_inv_txfm_ssse3.c:iadst16_stage3_ssse3:
 1883|   402k|static inline void iadst16_stage3_ssse3(__m128i *x) {
 1884|   402k|  btf_16_adds_subs_sse2(x[0], x[8]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1885|   402k|  btf_16_adds_subs_sse2(x[1], x[9]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1886|   402k|  btf_16_adds_subs_sse2(x[2], x[10]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1887|   402k|  btf_16_adds_subs_sse2(x[3], x[11]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1888|   402k|  btf_16_adds_subs_sse2(x[4], x[12]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1889|   402k|  btf_16_adds_subs_sse2(x[5], x[13]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1890|   402k|  btf_16_adds_subs_sse2(x[6], x[14]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1891|   402k|  btf_16_adds_subs_sse2(x[7], x[15]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1892|   402k|}
av1_inv_txfm_ssse3.c:iadst16_stage4_ssse3:
 1896|   160k|                                        int8_t cos_bit) {
 1897|   160k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|   160k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1898|   160k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|   160k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1899|   160k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|   160k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1900|   160k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|   160k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1901|   160k|  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  ------------------
  |  |   20|   160k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1902|   160k|  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  ------------------
  |  |   20|   160k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1903|   160k|  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   61|   160k|  do {                                            \
  |  |   62|   160k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   160k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   160k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   160k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   160k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   160k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   160k|                                                  \
  |  |   69|   160k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   160k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   160k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   160k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   160k|                                                  \
  |  |   74|   160k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   160k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   160k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   160k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   160k|                                                  \
  |  |   79|   160k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   160k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   160k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1904|   160k|  btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   61|   160k|  do {                                            \
  |  |   62|   160k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   160k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   160k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   160k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   160k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   160k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   160k|                                                  \
  |  |   69|   160k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   160k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   160k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   160k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   160k|                                                  \
  |  |   74|   160k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   160k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   160k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   160k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   160k|                                                  \
  |  |   79|   160k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   160k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   160k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1905|   160k|  btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   61|   160k|  do {                                            \
  |  |   62|   160k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   160k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   160k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   160k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   160k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   160k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   160k|                                                  \
  |  |   69|   160k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   160k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   160k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   160k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   160k|                                                  \
  |  |   74|   160k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   160k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   160k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   160k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   160k|                                                  \
  |  |   79|   160k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   160k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   160k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1906|   160k|  btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   61|   160k|  do {                                            \
  |  |   62|   160k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   160k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   160k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   160k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   160k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   160k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   160k|                                                  \
  |  |   69|   160k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   160k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   160k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   160k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   160k|                                                  \
  |  |   74|   160k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   160k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   160k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   160k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   160k|                                                  \
  |  |   79|   160k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   160k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   160k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1907|   160k|}
av1_inv_txfm_ssse3.c:iadst16_stage5_ssse3:
 1909|   402k|static inline void iadst16_stage5_ssse3(__m128i *x) {
 1910|   402k|  btf_16_adds_subs_sse2(x[0], x[4]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1911|   402k|  btf_16_adds_subs_sse2(x[1], x[5]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1912|   402k|  btf_16_adds_subs_sse2(x[2], x[6]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1913|   402k|  btf_16_adds_subs_sse2(x[3], x[7]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1914|   402k|  btf_16_adds_subs_sse2(x[8], x[12]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1915|   402k|  btf_16_adds_subs_sse2(x[9], x[13]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1916|   402k|  btf_16_adds_subs_sse2(x[10], x[14]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1917|   402k|  btf_16_adds_subs_sse2(x[11], x[15]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1918|   402k|}
av1_inv_txfm_ssse3.c:iadst16_stage6_ssse3:
 1922|   160k|                                        int8_t cos_bit) {
 1923|   160k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   160k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1924|   160k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   160k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1925|   160k|  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  ------------------
  |  |   20|   160k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1926|   160k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|   160k|  do {                                            \
  |  |   62|   160k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   160k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   160k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   160k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   160k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   160k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   160k|                                                  \
  |  |   69|   160k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   160k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   160k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   160k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   160k|                                                  \
  |  |   74|   160k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   160k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   160k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   160k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   160k|                                                  \
  |  |   79|   160k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   160k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   160k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1927|   160k|  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|   160k|  do {                                            \
  |  |   62|   160k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   160k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   160k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   160k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   160k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   160k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   160k|                                                  \
  |  |   69|   160k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   160k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   160k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   160k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   160k|                                                  \
  |  |   74|   160k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   160k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   160k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   160k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   160k|                                                  \
  |  |   79|   160k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   160k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   160k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1928|   160k|  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   61|   160k|  do {                                            \
  |  |   62|   160k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   160k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   160k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   160k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   160k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   160k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   160k|                                                  \
  |  |   69|   160k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   160k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   160k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   160k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   160k|                                                  \
  |  |   74|   160k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   160k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   160k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   160k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   160k|                                                  \
  |  |   79|   160k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   160k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   160k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1929|   160k|  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   61|   160k|  do {                                            \
  |  |   62|   160k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   160k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   160k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   160k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   160k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   160k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   160k|                                                  \
  |  |   69|   160k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   160k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   160k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   160k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   160k|                                                  \
  |  |   74|   160k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   160k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   160k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   160k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   160k|                                                  \
  |  |   79|   160k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   160k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   160k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1930|   160k|}
av1_inv_txfm_ssse3.c:iadst16_stage7_ssse3:
 1932|   402k|static inline void iadst16_stage7_ssse3(__m128i *x) {
 1933|   402k|  btf_16_adds_subs_sse2(x[0], x[2]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1934|   402k|  btf_16_adds_subs_sse2(x[1], x[3]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1935|   402k|  btf_16_adds_subs_sse2(x[4], x[6]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1936|   402k|  btf_16_adds_subs_sse2(x[5], x[7]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1937|   402k|  btf_16_adds_subs_sse2(x[8], x[10]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1938|   402k|  btf_16_adds_subs_sse2(x[9], x[11]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1939|   402k|  btf_16_adds_subs_sse2(x[12], x[14]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1940|   402k|  btf_16_adds_subs_sse2(x[13], x[15]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1941|   402k|}
av1_inv_txfm_ssse3.c:iadst16_sse2:
 2057|  37.7k|static void iadst16_sse2(const __m128i *input, __m128i *output) {
 2058|  37.7k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  37.7k|#define INV_COS_BIT 12
  ------------------
 2059|  37.7k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  37.7k|#define INV_COS_BIT 12
  ------------------
 2060|  37.7k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  37.7k|#define INV_COS_BIT 12
  ------------------
 2061|  37.7k|  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2062|  37.7k|  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2063|  37.7k|  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2064|  37.7k|  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2065|  37.7k|  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2066|  37.7k|  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2067|  37.7k|  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2068|  37.7k|  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2069|  37.7k|  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2070|  37.7k|  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2071|  37.7k|  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2072|  37.7k|  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2073|  37.7k|  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2074|  37.7k|  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2075|  37.7k|  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2076|  37.7k|  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  ------------------
  |  |   20|  37.7k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2077|       |
 2078|       |  // stage 1
 2079|  37.7k|  __m128i x[16];
 2080|  37.7k|  x[0] = input[15];
 2081|  37.7k|  x[1] = input[0];
 2082|  37.7k|  x[2] = input[13];
 2083|  37.7k|  x[3] = input[2];
 2084|  37.7k|  x[4] = input[11];
 2085|  37.7k|  x[5] = input[4];
 2086|  37.7k|  x[6] = input[9];
 2087|  37.7k|  x[7] = input[6];
 2088|  37.7k|  x[8] = input[7];
 2089|  37.7k|  x[9] = input[8];
 2090|  37.7k|  x[10] = input[5];
 2091|  37.7k|  x[11] = input[10];
 2092|  37.7k|  x[12] = input[3];
 2093|  37.7k|  x[13] = input[12];
 2094|  37.7k|  x[14] = input[1];
 2095|  37.7k|  x[15] = input[14];
 2096|       |
 2097|       |  // stage 2
 2098|  37.7k|  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|  37.7k|  do {                                            \
  |  |   62|  37.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  37.7k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  37.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  37.7k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  37.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  37.7k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  37.7k|                                                  \
  |  |   69|  37.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  37.7k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  37.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  37.7k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  37.7k|                                                  \
  |  |   74|  37.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  37.7k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  37.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  37.7k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  37.7k|                                                  \
  |  |   79|  37.7k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  37.7k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  37.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2099|  37.7k|  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  37.7k|  do {                                            \
  |  |   62|  37.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  37.7k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  37.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  37.7k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  37.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  37.7k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  37.7k|                                                  \
  |  |   69|  37.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  37.7k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  37.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  37.7k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  37.7k|                                                  \
  |  |   74|  37.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  37.7k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  37.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  37.7k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  37.7k|                                                  \
  |  |   79|  37.7k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  37.7k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  37.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2100|  37.7k|  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   61|  37.7k|  do {                                            \
  |  |   62|  37.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  37.7k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  37.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  37.7k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  37.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  37.7k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  37.7k|                                                  \
  |  |   69|  37.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  37.7k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  37.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  37.7k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  37.7k|                                                  \
  |  |   74|  37.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  37.7k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  37.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  37.7k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  37.7k|                                                  \
  |  |   79|  37.7k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  37.7k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  37.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2101|  37.7k|  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   61|  37.7k|  do {                                            \
  |  |   62|  37.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  37.7k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  37.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  37.7k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  37.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  37.7k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  37.7k|                                                  \
  |  |   69|  37.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  37.7k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  37.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  37.7k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  37.7k|                                                  \
  |  |   74|  37.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  37.7k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  37.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  37.7k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  37.7k|                                                  \
  |  |   79|  37.7k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  37.7k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  37.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2102|  37.7k|  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   61|  37.7k|  do {                                            \
  |  |   62|  37.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  37.7k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  37.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  37.7k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  37.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  37.7k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  37.7k|                                                  \
  |  |   69|  37.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  37.7k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  37.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  37.7k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  37.7k|                                                  \
  |  |   74|  37.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  37.7k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  37.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  37.7k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  37.7k|                                                  \
  |  |   79|  37.7k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  37.7k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  37.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2103|  37.7k|  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   61|  37.7k|  do {                                            \
  |  |   62|  37.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  37.7k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  37.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  37.7k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  37.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  37.7k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  37.7k|                                                  \
  |  |   69|  37.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  37.7k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  37.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  37.7k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  37.7k|                                                  \
  |  |   74|  37.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  37.7k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  37.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  37.7k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  37.7k|                                                  \
  |  |   79|  37.7k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  37.7k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  37.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2104|  37.7k|  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   61|  37.7k|  do {                                            \
  |  |   62|  37.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  37.7k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  37.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  37.7k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  37.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  37.7k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  37.7k|                                                  \
  |  |   69|  37.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  37.7k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  37.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  37.7k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  37.7k|                                                  \
  |  |   74|  37.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  37.7k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  37.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  37.7k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  37.7k|                                                  \
  |  |   79|  37.7k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  37.7k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  37.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2105|  37.7k|  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   61|  37.7k|  do {                                            \
  |  |   62|  37.7k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  37.7k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  37.7k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  37.7k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  37.7k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  37.7k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  37.7k|                                                  \
  |  |   69|  37.7k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  37.7k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  37.7k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  37.7k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  37.7k|                                                  \
  |  |   74|  37.7k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  37.7k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  37.7k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  37.7k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  37.7k|                                                  \
  |  |   79|  37.7k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  37.7k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  37.7k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2106|       |
 2107|       |  // stage 3~9
 2108|  37.7k|  iadst16_stage3_ssse3(x);
 2109|  37.7k|  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
 2110|  37.7k|  iadst16_stage5_ssse3(x);
 2111|  37.7k|  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
 2112|  37.7k|  iadst16_stage7_ssse3(x);
 2113|  37.7k|  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
 2114|  37.7k|  iadst16_stage9_ssse3(output, x);
 2115|  37.7k|}
av1_inv_txfm_ssse3.c:idct32_low1_ssse3:
  597|   132k|static void idct32_low1_ssse3(const __m128i *input, __m128i *output) {
  598|   132k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   132k|#define INV_COS_BIT 12
  ------------------
  599|       |
  600|       |  // stage 1
  601|   132k|  __m128i x[2];
  602|   132k|  x[0] = input[0];
  603|       |
  604|       |  // stage 2
  605|       |  // stage 3
  606|       |  // stage 4
  607|       |  // stage 5
  608|   132k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|   132k|  do {                                          \
  |  |   29|   132k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   132k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   132k|    const __m128i _in = in;                     \
  |  |   32|   132k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   132k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   132k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  609|       |
  610|       |  // stage 6
  611|       |  // stage 7
  612|       |  // stage 8
  613|       |  // stage 9
  614|   132k|  output[0] = x[0];
  615|   132k|  output[31] = x[0];
  616|   132k|  output[1] = x[1];
  617|   132k|  output[30] = x[1];
  618|   132k|  output[2] = x[1];
  619|   132k|  output[29] = x[1];
  620|   132k|  output[3] = x[0];
  621|   132k|  output[28] = x[0];
  622|   132k|  output[4] = x[0];
  623|   132k|  output[27] = x[0];
  624|   132k|  output[5] = x[1];
  625|   132k|  output[26] = x[1];
  626|   132k|  output[6] = x[1];
  627|   132k|  output[25] = x[1];
  628|   132k|  output[7] = x[0];
  629|   132k|  output[24] = x[0];
  630|   132k|  output[8] = x[0];
  631|   132k|  output[23] = x[0];
  632|   132k|  output[9] = x[1];
  633|   132k|  output[22] = x[1];
  634|   132k|  output[10] = x[1];
  635|   132k|  output[21] = x[1];
  636|   132k|  output[11] = x[0];
  637|   132k|  output[20] = x[0];
  638|   132k|  output[12] = x[0];
  639|   132k|  output[19] = x[0];
  640|   132k|  output[13] = x[1];
  641|   132k|  output[18] = x[1];
  642|   132k|  output[14] = x[1];
  643|   132k|  output[17] = x[1];
  644|   132k|  output[15] = x[0];
  645|   132k|  output[16] = x[0];
  646|   132k|}
av1_inv_txfm_ssse3.c:idct32_low8_ssse3:
  648|   273k|static void idct32_low8_ssse3(const __m128i *input, __m128i *output) {
  649|   273k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   273k|#define INV_COS_BIT 12
  ------------------
  650|   273k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   273k|#define INV_COS_BIT 12
  ------------------
  651|   273k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   273k|#define INV_COS_BIT 12
  ------------------
  652|       |
  653|       |  // stage 1
  654|   273k|  __m128i x[32];
  655|   273k|  x[0] = input[0];
  656|   273k|  x[4] = input[4];
  657|   273k|  x[8] = input[2];
  658|   273k|  x[12] = input[6];
  659|   273k|  x[16] = input[1];
  660|   273k|  x[20] = input[5];
  661|   273k|  x[24] = input[3];
  662|   273k|  x[28] = input[7];
  663|       |
  664|       |  // stage 2
  665|   273k|  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   28|   273k|  do {                                          \
  |  |   29|   273k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   273k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   273k|    const __m128i _in = in;                     \
  |  |   32|   273k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   273k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   273k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  666|   273k|  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   28|   273k|  do {                                          \
  |  |   29|   273k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   273k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   273k|    const __m128i _in = in;                     \
  |  |   32|   273k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   273k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   273k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  667|   273k|  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   28|   273k|  do {                                          \
  |  |   29|   273k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   273k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   273k|    const __m128i _in = in;                     \
  |  |   32|   273k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   273k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   273k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  668|   273k|  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   28|   273k|  do {                                          \
  |  |   29|   273k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   273k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   273k|    const __m128i _in = in;                     \
  |  |   32|   273k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   273k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   273k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  669|       |
  670|       |  // stage 3
  671|   273k|  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   28|   273k|  do {                                          \
  |  |   29|   273k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   273k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   273k|    const __m128i _in = in;                     \
  |  |   32|   273k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   273k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   273k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  672|   273k|  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   28|   273k|  do {                                          \
  |  |   29|   273k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   273k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   273k|    const __m128i _in = in;                     \
  |  |   32|   273k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   273k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   273k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  673|   273k|  x[17] = x[16];
  674|   273k|  x[18] = x[19];
  675|   273k|  x[21] = x[20];
  676|   273k|  x[22] = x[23];
  677|   273k|  x[25] = x[24];
  678|   273k|  x[26] = x[27];
  679|   273k|  x[29] = x[28];
  680|   273k|  x[30] = x[31];
  681|       |
  682|       |  // stage 4
  683|   273k|  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   28|   273k|  do {                                          \
  |  |   29|   273k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   273k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   273k|    const __m128i _in = in;                     \
  |  |   32|   273k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   273k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   273k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  684|   273k|  x[9] = x[8];
  685|   273k|  x[10] = x[11];
  686|   273k|  x[13] = x[12];
  687|   273k|  x[14] = x[15];
  688|   273k|  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
  689|       |
  690|       |  // stage 5
  691|   273k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|   273k|  do {                                          \
  |  |   29|   273k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|   273k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|   273k|    const __m128i _in = in;                     \
  |  |   32|   273k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|   273k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|   273k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  692|   273k|  x[5] = x[4];
  693|   273k|  x[6] = x[7];
  694|   273k|  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  695|       |  // stage 6
  696|   273k|  x[3] = x[0];
  697|   273k|  x[2] = x[1];
  698|   273k|  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
  699|       |
  700|   273k|  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  701|   273k|  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  702|   273k|  idct32_stage9_sse2(output, x);
  703|   273k|}
av1_inv_txfm_ssse3.c:idct32_high16_stage4_sse2:
  488|   340k|                                             int8_t cos_bit) {
  489|   340k|  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  490|   340k|  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  491|   340k|  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  492|   340k|  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  493|   340k|  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  494|   340k|  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  495|   340k|  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  496|   340k|  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  497|   340k|  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  498|   340k|  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  499|   340k|}
av1_inv_txfm_ssse3.c:idct32_high24_stage5_sse2:
  503|   340k|                                             int8_t cos_bit) {
  504|   340k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  505|   340k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  506|   340k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  507|   340k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  508|   340k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  509|   340k|  btf_16_adds_subs_sse2(x[16], x[19]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  510|   340k|  btf_16_adds_subs_sse2(x[17], x[18]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  511|   340k|  btf_16_subs_adds_sse2(x[23], x[20]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  512|   340k|  btf_16_subs_adds_sse2(x[22], x[21]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  513|   340k|  btf_16_adds_subs_sse2(x[24], x[27]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  514|   340k|  btf_16_adds_subs_sse2(x[25], x[26]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  515|   340k|  btf_16_subs_adds_sse2(x[31], x[28]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  516|   340k|  btf_16_subs_adds_sse2(x[30], x[29]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  517|   340k|}
av1_inv_txfm_ssse3.c:idct32_high28_stage6_sse2:
  521|   340k|                                             int8_t cos_bit) {
  522|   340k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  523|   340k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  524|   340k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  525|   340k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  526|   340k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  527|   340k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  528|   340k|  btf_16_adds_subs_sse2(x[8], x[11]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  529|   340k|  btf_16_adds_subs_sse2(x[9], x[10]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  530|   340k|  btf_16_subs_adds_sse2(x[15], x[12]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  531|   340k|  btf_16_subs_adds_sse2(x[14], x[13]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  532|   340k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  533|   340k|  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  534|   340k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  535|   340k|  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  536|   340k|}
av1_inv_txfm_ssse3.c:idct32_stage7_sse2:
  540|   340k|                                      int8_t cos_bit) {
  541|   340k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  542|   340k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  543|   340k|  btf_16_adds_subs_sse2(x[0], x[7]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  544|   340k|  btf_16_adds_subs_sse2(x[1], x[6]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  545|   340k|  btf_16_adds_subs_sse2(x[2], x[5]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  546|   340k|  btf_16_adds_subs_sse2(x[3], x[4]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  547|   340k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  548|   340k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  549|   340k|  btf_16_adds_subs_sse2(x[16], x[23]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  550|   340k|  btf_16_adds_subs_sse2(x[17], x[22]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  551|   340k|  btf_16_adds_subs_sse2(x[18], x[21]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  552|   340k|  btf_16_adds_subs_sse2(x[19], x[20]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  553|   340k|  btf_16_subs_adds_sse2(x[31], x[24]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  554|   340k|  btf_16_subs_adds_sse2(x[30], x[25]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  555|   340k|  btf_16_subs_adds_sse2(x[29], x[26]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  556|   340k|  btf_16_subs_adds_sse2(x[28], x[27]);
  ------------------
  |  |   45|   340k|  do {                                  \
  |  |   46|   340k|    const __m128i _in0 = in0;           \
  |  |   47|   340k|    const __m128i _in1 = in1;           \
  |  |   48|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  557|   340k|}
av1_inv_txfm_ssse3.c:idct32_stage8_sse2:
  561|   340k|                                      int8_t cos_bit) {
  562|   340k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  563|   340k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   340k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  564|   340k|  btf_16_adds_subs_sse2(x[0], x[15]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  565|   340k|  btf_16_adds_subs_sse2(x[1], x[14]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  566|   340k|  btf_16_adds_subs_sse2(x[2], x[13]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  567|   340k|  btf_16_adds_subs_sse2(x[3], x[12]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  568|   340k|  btf_16_adds_subs_sse2(x[4], x[11]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  569|   340k|  btf_16_adds_subs_sse2(x[5], x[10]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  570|   340k|  btf_16_adds_subs_sse2(x[6], x[9]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  571|   340k|  btf_16_adds_subs_sse2(x[7], x[8]);
  ------------------
  |  |   37|   340k|  do {                                  \
  |  |   38|   340k|    const __m128i _in0 = in0;           \
  |  |   39|   340k|    const __m128i _in1 = in1;           \
  |  |   40|   340k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   340k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  572|   340k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  573|   340k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  574|   340k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  575|   340k|  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
  ------------------
  |  |   61|   340k|  do {                                            \
  |  |   62|   340k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|   340k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|   340k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|   340k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|   340k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|   340k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|   340k|                                                  \
  |  |   69|   340k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|   340k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|   340k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|   340k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|   340k|                                                  \
  |  |   74|   340k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|   340k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|   340k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|   340k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|   340k|                                                  \
  |  |   79|   340k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|   340k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  576|   340k|}
av1_inv_txfm_ssse3.c:idct32_stage9_sse2:
  578|   340k|static inline void idct32_stage9_sse2(__m128i *output, __m128i *x) {
  579|   340k|  btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  580|   340k|  btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  581|   340k|  btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  582|   340k|  btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  583|   340k|  btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  584|   340k|  btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  585|   340k|  btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  586|   340k|  btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  587|   340k|  btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  588|   340k|  btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  589|   340k|  btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  590|   340k|  btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  591|   340k|  btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  592|   340k|  btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  593|   340k|  btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  594|   340k|  btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
  ------------------
  |  |   53|   340k|  do {                                                  \
  |  |   54|   340k|    const __m128i _in0 = in0;                           \
  |  |   55|   340k|    const __m128i _in1 = in1;                           \
  |  |   56|   340k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   340k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   340k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  595|   340k|}
av1_inv_txfm_ssse3.c:idct32_low16_ssse3:
  705|  50.8k|static void idct32_low16_ssse3(const __m128i *input, __m128i *output) {
  706|  50.8k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  50.8k|#define INV_COS_BIT 12
  ------------------
  707|  50.8k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  50.8k|#define INV_COS_BIT 12
  ------------------
  708|  50.8k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  50.8k|#define INV_COS_BIT 12
  ------------------
  709|       |
  710|       |  // stage 1
  711|  50.8k|  __m128i x[32];
  712|  50.8k|  x[0] = input[0];
  713|  50.8k|  x[2] = input[8];
  714|  50.8k|  x[4] = input[4];
  715|  50.8k|  x[6] = input[12];
  716|  50.8k|  x[8] = input[2];
  717|  50.8k|  x[10] = input[10];
  718|  50.8k|  x[12] = input[6];
  719|  50.8k|  x[14] = input[14];
  720|  50.8k|  x[16] = input[1];
  721|  50.8k|  x[18] = input[9];
  722|  50.8k|  x[20] = input[5];
  723|  50.8k|  x[22] = input[13];
  724|  50.8k|  x[24] = input[3];
  725|  50.8k|  x[26] = input[11];
  726|  50.8k|  x[28] = input[7];
  727|  50.8k|  x[30] = input[15];
  728|       |
  729|       |  // stage 2
  730|  50.8k|  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  731|  50.8k|  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  732|  50.8k|  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  733|  50.8k|  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  734|  50.8k|  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  735|  50.8k|  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  736|  50.8k|  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  737|  50.8k|  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  738|       |
  739|       |  // stage 3
  740|  50.8k|  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  741|  50.8k|  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  742|  50.8k|  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  743|  50.8k|  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  744|  50.8k|  idct32_high16_stage3_sse2(x);
  745|       |
  746|       |  // stage 4
  747|  50.8k|  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  748|  50.8k|  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  749|  50.8k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|  50.8k|  do {                                  \
  |  |   38|  50.8k|    const __m128i _in0 = in0;           \
  |  |   39|  50.8k|    const __m128i _in1 = in1;           \
  |  |   40|  50.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  50.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  750|  50.8k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|  50.8k|  do {                                  \
  |  |   46|  50.8k|    const __m128i _in0 = in0;           \
  |  |   47|  50.8k|    const __m128i _in1 = in1;           \
  |  |   48|  50.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  50.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  751|  50.8k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|  50.8k|  do {                                  \
  |  |   38|  50.8k|    const __m128i _in0 = in0;           \
  |  |   39|  50.8k|    const __m128i _in1 = in1;           \
  |  |   40|  50.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  50.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  752|  50.8k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|  50.8k|  do {                                  \
  |  |   46|  50.8k|    const __m128i _in0 = in0;           \
  |  |   47|  50.8k|    const __m128i _in1 = in1;           \
  |  |   48|  50.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  50.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  753|  50.8k|  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
  754|       |
  755|       |  // stage 5
  756|  50.8k|  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  757|  50.8k|  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  ------------------
  |  |   28|  50.8k|  do {                                          \
  |  |   29|  50.8k|    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
  |  |   30|  50.8k|    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
  |  |   31|  50.8k|    const __m128i _in = in;                     \
  |  |   32|  50.8k|    out0 = _mm_mulhrs_epi16(_in, _w0);          \
  |  |   33|  50.8k|    out1 = _mm_mulhrs_epi16(_in, _w1);          \
  |  |   34|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (34:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  758|  50.8k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  50.8k|  do {                                  \
  |  |   38|  50.8k|    const __m128i _in0 = in0;           \
  |  |   39|  50.8k|    const __m128i _in1 = in1;           \
  |  |   40|  50.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  50.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  759|  50.8k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|  50.8k|  do {                                  \
  |  |   46|  50.8k|    const __m128i _in0 = in0;           \
  |  |   47|  50.8k|    const __m128i _in1 = in1;           \
  |  |   48|  50.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  50.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  760|  50.8k|  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  761|       |
  762|  50.8k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|  50.8k|  do {                                  \
  |  |   38|  50.8k|    const __m128i _in0 = in0;           \
  |  |   39|  50.8k|    const __m128i _in1 = in1;           \
  |  |   40|  50.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  50.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  763|  50.8k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|  50.8k|  do {                                  \
  |  |   38|  50.8k|    const __m128i _in0 = in0;           \
  |  |   39|  50.8k|    const __m128i _in1 = in1;           \
  |  |   40|  50.8k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  50.8k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  50.8k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  764|  50.8k|  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
  765|       |
  766|  50.8k|  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  767|  50.8k|  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  768|  50.8k|  idct32_stage9_sse2(output, x);
  769|  50.8k|}
av1_inv_txfm_ssse3.c:idct32_high16_stage3_sse2:
  475|  66.9k|static inline void idct32_high16_stage3_sse2(__m128i *x) {
  476|  66.9k|  btf_16_adds_subs_sse2(x[16], x[17]);
  ------------------
  |  |   37|  66.9k|  do {                                  \
  |  |   38|  66.9k|    const __m128i _in0 = in0;           \
  |  |   39|  66.9k|    const __m128i _in1 = in1;           \
  |  |   40|  66.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  66.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  66.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  477|  66.9k|  btf_16_subs_adds_sse2(x[19], x[18]);
  ------------------
  |  |   45|  66.9k|  do {                                  \
  |  |   46|  66.9k|    const __m128i _in0 = in0;           \
  |  |   47|  66.9k|    const __m128i _in1 = in1;           \
  |  |   48|  66.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  66.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  66.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  478|  66.9k|  btf_16_adds_subs_sse2(x[20], x[21]);
  ------------------
  |  |   37|  66.9k|  do {                                  \
  |  |   38|  66.9k|    const __m128i _in0 = in0;           \
  |  |   39|  66.9k|    const __m128i _in1 = in1;           \
  |  |   40|  66.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  66.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  66.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  479|  66.9k|  btf_16_subs_adds_sse2(x[23], x[22]);
  ------------------
  |  |   45|  66.9k|  do {                                  \
  |  |   46|  66.9k|    const __m128i _in0 = in0;           \
  |  |   47|  66.9k|    const __m128i _in1 = in1;           \
  |  |   48|  66.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  66.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  66.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  480|  66.9k|  btf_16_adds_subs_sse2(x[24], x[25]);
  ------------------
  |  |   37|  66.9k|  do {                                  \
  |  |   38|  66.9k|    const __m128i _in0 = in0;           \
  |  |   39|  66.9k|    const __m128i _in1 = in1;           \
  |  |   40|  66.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  66.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  66.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  481|  66.9k|  btf_16_subs_adds_sse2(x[27], x[26]);
  ------------------
  |  |   45|  66.9k|  do {                                  \
  |  |   46|  66.9k|    const __m128i _in0 = in0;           \
  |  |   47|  66.9k|    const __m128i _in1 = in1;           \
  |  |   48|  66.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  66.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  66.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  482|  66.9k|  btf_16_adds_subs_sse2(x[28], x[29]);
  ------------------
  |  |   37|  66.9k|  do {                                  \
  |  |   38|  66.9k|    const __m128i _in0 = in0;           \
  |  |   39|  66.9k|    const __m128i _in1 = in1;           \
  |  |   40|  66.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  66.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  66.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  483|  66.9k|  btf_16_subs_adds_sse2(x[31], x[30]);
  ------------------
  |  |   45|  66.9k|  do {                                  \
  |  |   46|  66.9k|    const __m128i _in0 = in0;           \
  |  |   47|  66.9k|    const __m128i _in1 = in1;           \
  |  |   48|  66.9k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  66.9k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  66.9k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  484|  66.9k|}
av1_inv_txfm_ssse3.c:idct32_sse2:
  771|  16.1k|static void idct32_sse2(const __m128i *input, __m128i *output) {
  772|  16.1k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|  16.1k|#define INV_COS_BIT 12
  ------------------
  773|  16.1k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|  16.1k|#define INV_COS_BIT 12
  ------------------
  774|  16.1k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|  16.1k|#define INV_COS_BIT 12
  ------------------
  775|       |
  776|  16.1k|  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  777|  16.1k|  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  778|  16.1k|  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  779|  16.1k|  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  780|  16.1k|  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  781|  16.1k|  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  782|  16.1k|  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  783|  16.1k|  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  784|  16.1k|  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  785|  16.1k|  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  786|  16.1k|  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  787|  16.1k|  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  788|  16.1k|  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  789|  16.1k|  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  790|  16.1k|  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  791|  16.1k|  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  792|  16.1k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  793|  16.1k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  794|  16.1k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  795|  16.1k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  796|  16.1k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  797|  16.1k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  798|  16.1k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  799|  16.1k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  800|  16.1k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  801|  16.1k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  802|  16.1k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  803|  16.1k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  804|  16.1k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  805|  16.1k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  806|  16.1k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  807|  16.1k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|  16.1k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  808|       |
  809|       |  // stage 1
  810|  16.1k|  __m128i x[32];
  811|  16.1k|  x[0] = input[0];
  812|  16.1k|  x[1] = input[16];
  813|  16.1k|  x[2] = input[8];
  814|  16.1k|  x[3] = input[24];
  815|  16.1k|  x[4] = input[4];
  816|  16.1k|  x[5] = input[20];
  817|  16.1k|  x[6] = input[12];
  818|  16.1k|  x[7] = input[28];
  819|  16.1k|  x[8] = input[2];
  820|  16.1k|  x[9] = input[18];
  821|  16.1k|  x[10] = input[10];
  822|  16.1k|  x[11] = input[26];
  823|  16.1k|  x[12] = input[6];
  824|  16.1k|  x[13] = input[22];
  825|  16.1k|  x[14] = input[14];
  826|  16.1k|  x[15] = input[30];
  827|  16.1k|  x[16] = input[1];
  828|  16.1k|  x[17] = input[17];
  829|  16.1k|  x[18] = input[9];
  830|  16.1k|  x[19] = input[25];
  831|  16.1k|  x[20] = input[5];
  832|  16.1k|  x[21] = input[21];
  833|  16.1k|  x[22] = input[13];
  834|  16.1k|  x[23] = input[29];
  835|  16.1k|  x[24] = input[3];
  836|  16.1k|  x[25] = input[19];
  837|  16.1k|  x[26] = input[11];
  838|  16.1k|  x[27] = input[27];
  839|  16.1k|  x[28] = input[7];
  840|  16.1k|  x[29] = input[23];
  841|  16.1k|  x[30] = input[15];
  842|  16.1k|  x[31] = input[31];
  843|       |
  844|       |  // stage 2
  845|  16.1k|  btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  846|  16.1k|  btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  847|  16.1k|  btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  848|  16.1k|  btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  849|  16.1k|  btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  850|  16.1k|  btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  851|  16.1k|  btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  852|  16.1k|  btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  853|       |
  854|       |  // stage 3
  855|  16.1k|  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  856|  16.1k|  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  857|  16.1k|  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  858|  16.1k|  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  859|  16.1k|  idct32_high16_stage3_sse2(x);
  860|       |
  861|       |  // stage 4
  862|  16.1k|  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  863|  16.1k|  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  864|  16.1k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|  16.1k|  do {                                  \
  |  |   38|  16.1k|    const __m128i _in0 = in0;           \
  |  |   39|  16.1k|    const __m128i _in1 = in1;           \
  |  |   40|  16.1k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  16.1k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  865|  16.1k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|  16.1k|  do {                                  \
  |  |   46|  16.1k|    const __m128i _in0 = in0;           \
  |  |   47|  16.1k|    const __m128i _in1 = in1;           \
  |  |   48|  16.1k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  16.1k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  866|  16.1k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|  16.1k|  do {                                  \
  |  |   38|  16.1k|    const __m128i _in0 = in0;           \
  |  |   39|  16.1k|    const __m128i _in1 = in1;           \
  |  |   40|  16.1k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  16.1k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  867|  16.1k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|  16.1k|  do {                                  \
  |  |   46|  16.1k|    const __m128i _in0 = in0;           \
  |  |   47|  16.1k|    const __m128i _in1 = in1;           \
  |  |   48|  16.1k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|  16.1k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  868|  16.1k|  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
  869|       |
  870|       |  // stage 5
  871|  16.1k|  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  872|  16.1k|  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   61|  16.1k|  do {                                            \
  |  |   62|  16.1k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
  |  |   63|  16.1k|    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
  |  |   64|  16.1k|    __m128i u0 = _mm_madd_epi16(t0, w0);          \
  |  |   65|  16.1k|    __m128i u1 = _mm_madd_epi16(t1, w0);          \
  |  |   66|  16.1k|    __m128i v0 = _mm_madd_epi16(t0, w1);          \
  |  |   67|  16.1k|    __m128i v1 = _mm_madd_epi16(t1, w1);          \
  |  |   68|  16.1k|                                                  \
  |  |   69|  16.1k|    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
  |  |   70|  16.1k|    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
  |  |   71|  16.1k|    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
  |  |   72|  16.1k|    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
  |  |   73|  16.1k|                                                  \
  |  |   74|  16.1k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
  |  |   75|  16.1k|    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
  |  |   76|  16.1k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
  |  |   77|  16.1k|    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
  |  |   78|  16.1k|                                                  \
  |  |   79|  16.1k|    out0 = _mm_packs_epi32(c0, c1);               \
  |  |   80|  16.1k|    out1 = _mm_packs_epi32(d0, d1);               \
  |  |   81|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (81:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  873|  16.1k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|  16.1k|  do {                                  \
  |  |   38|  16.1k|    const __m128i _in0 = in0;           \
  |  |   39|  16.1k|    const __m128i _in1 = in1;           \
  |  |   40|  16.1k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  16.1k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  874|  16.1k|  btf_16_adds_subs_sse2(x[7], x[6]);
  ------------------
  |  |   37|  16.1k|  do {                                  \
  |  |   38|  16.1k|    const __m128i _in0 = in0;           \
  |  |   39|  16.1k|    const __m128i _in1 = in1;           \
  |  |   40|  16.1k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  16.1k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  875|  16.1k|  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  876|       |
  877|       |  // stage 6
  878|  16.1k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|  16.1k|  do {                                  \
  |  |   38|  16.1k|    const __m128i _in0 = in0;           \
  |  |   39|  16.1k|    const __m128i _in1 = in1;           \
  |  |   40|  16.1k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  16.1k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  879|  16.1k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|  16.1k|  do {                                  \
  |  |   38|  16.1k|    const __m128i _in0 = in0;           \
  |  |   39|  16.1k|    const __m128i _in1 = in1;           \
  |  |   40|  16.1k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|  16.1k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|  16.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  880|  16.1k|  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
  881|       |
  882|       |  // stage 7~9
  883|  16.1k|  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  884|  16.1k|  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  885|  16.1k|  idct32_stage9_sse2(output, x);
  886|  16.1k|}
av1_inv_txfm_ssse3.c:lowbd_get_recon_8x8_sse2:
 2236|  12.6M|                                               __m128i res) {
 2237|  12.6M|  const __m128i zero = _mm_setzero_si128();
 2238|  12.6M|  __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
 2239|  12.6M|  return _mm_packus_epi16(x0, x0);
 2240|  12.6M|}
av1_inv_txfm_ssse3.c:round_shift_ssse3:
 2467|  1.58M|                                     int size) {
 2468|  1.58M|  const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
 2469|  13.1M|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (2469:19): [True: 11.5M, False: 1.58M]
  ------------------
 2470|  11.5M|    output[i] = _mm_mulhrs_epi16(input[i], scale);
 2471|  11.5M|  }
 2472|  1.58M|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_4x4_ssse3:
 2412|   588k|                                           int eob) {
 2413|   588k|  (void)tx_size_;
 2414|   588k|  (void)eob;
 2415|   588k|  __m128i buf[4];
 2416|   588k|  const TX_SIZE tx_size = TX_4X4;
 2417|   588k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2418|   588k|  const int txw_idx = get_txw_idx(tx_size);
 2419|   588k|  const int txh_idx = get_txh_idx(tx_size);
 2420|   588k|  const int txfm_size_col = tx_size_wide[tx_size];
 2421|   588k|  const int txfm_size_row = tx_size_high[tx_size];
 2422|       |
 2423|   588k|  const transform_1d_ssse3 row_txfm =
 2424|   588k|      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
 2425|   588k|  const transform_1d_ssse3 col_txfm =
 2426|   588k|      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
 2427|       |
 2428|   588k|  int ud_flip, lr_flip;
 2429|   588k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2430|   588k|  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
 2431|   588k|  row_txfm(buf, buf);
 2432|   588k|  if (lr_flip) {
  ------------------
  |  Branch (2432:7): [True: 36.7k, False: 551k]
  ------------------
 2433|  36.7k|    __m128i temp[4];
 2434|  36.7k|    flip_buf_sse2(buf, temp, txfm_size_col);
 2435|  36.7k|    transpose_16bit_4x4(temp, buf);
 2436|   551k|  } else {
 2437|   551k|    transpose_16bit_4x4(buf, buf);
 2438|   551k|  }
 2439|   588k|  col_txfm(buf, buf);
 2440|   588k|  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
 2441|   588k|  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 2442|   588k|}
av1_inv_txfm_ssse3.c:idct4_w4_sse2:
   53|   561k|static void idct4_w4_sse2(const __m128i *input, __m128i *output) {
   54|   561k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   561k|#define INV_COS_BIT 12
  ------------------
   55|   561k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   561k|#define INV_COS_BIT 12
  ------------------
   56|   561k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   561k|#define INV_COS_BIT 12
  ------------------
   57|       |
   58|   561k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   561k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   59|   561k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   561k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   60|   561k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   561k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   61|   561k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   561k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
   62|       |
   63|       |  // stage 1
   64|   561k|  __m128i x[4];
   65|   561k|  x[0] = input[0];
   66|   561k|  x[1] = input[2];
   67|   561k|  x[2] = input[1];
   68|   561k|  x[3] = input[3];
   69|       |
   70|       |  // stage 2
   71|   561k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|   561k|  do {                                               \
  |  |   46|   561k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   561k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   561k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   561k|                                                     \
  |  |   50|   561k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   561k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   561k|                                                     \
  |  |   53|   561k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   561k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   561k|                                                     \
  |  |   56|   561k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   561k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   561k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   72|   561k|  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|   561k|  do {                                               \
  |  |   46|   561k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   561k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   561k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   561k|                                                     \
  |  |   50|   561k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   561k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   561k|                                                     \
  |  |   53|   561k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   561k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   561k|                                                     \
  |  |   56|   561k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   561k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   561k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   73|       |
   74|       |  // stage 3
   75|   561k|  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  ------------------
  |  |   53|   561k|  do {                                                  \
  |  |   54|   561k|    const __m128i _in0 = in0;                           \
  |  |   55|   561k|    const __m128i _in1 = in1;                           \
  |  |   56|   561k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   561k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   561k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   76|   561k|  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
  ------------------
  |  |   53|   561k|  do {                                                  \
  |  |   54|   561k|    const __m128i _in0 = in0;                           \
  |  |   55|   561k|    const __m128i _in1 = in1;                           \
  |  |   56|   561k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   561k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   561k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
   77|   561k|}
av1_inv_txfm_ssse3.c:iadst4_w4_sse2:
 1657|   372k|static void iadst4_w4_sse2(const __m128i *input, __m128i *output) {
 1658|   372k|  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   372k|#define INV_COS_BIT 12
  ------------------
 1659|   372k|  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  ------------------
  |  |   20|   372k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1660|   372k|  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  ------------------
  |  |   20|   372k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1661|   372k|  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  ------------------
  |  |   20|   372k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1662|   372k|  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  ------------------
  |  |   20|   372k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1663|   372k|  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  ------------------
  |  |   20|   372k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1664|   372k|  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  ------------------
  |  |   20|   372k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1665|   372k|  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  ------------------
  |  |   20|   372k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1666|   372k|  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  ------------------
  |  |   20|   372k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1667|   372k|  __m128i x0[4];
 1668|   372k|  x0[0] = input[0];
 1669|   372k|  x0[1] = input[1];
 1670|   372k|  x0[2] = input[2];
 1671|   372k|  x0[3] = input[3];
 1672|       |
 1673|   372k|  __m128i u[2];
 1674|   372k|  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
 1675|   372k|  u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
 1676|       |
 1677|   372k|  __m128i x1[8];
 1678|   372k|  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
 1679|   372k|  x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
 1680|   372k|  x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
 1681|   372k|  x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
 1682|   372k|  x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
 1683|   372k|  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x3*sin3
 1684|   372k|  x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
 1685|   372k|  x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
 1686|       |
 1687|   372k|  __m128i x2[4];
 1688|   372k|  x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
 1689|   372k|  x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
 1690|   372k|  x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
 1691|   372k|  x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
 1692|       |
 1693|   372k|  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   372k|#define INV_COS_BIT 12
  ------------------
 1694|  1.86M|  for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (1694:19): [True: 1.48M, False: 372k]
  ------------------
 1695|  1.48M|    __m128i out0 = _mm_add_epi32(x2[i], rounding);
 1696|  1.48M|    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
  ------------------
  |  |   43|  1.48M|#define INV_COS_BIT 12
  ------------------
 1697|  1.48M|    output[i] = _mm_packs_epi32(out0, out0);
 1698|  1.48M|  }
 1699|   372k|}
av1_inv_txfm_ssse3.c:idct8_w4_sse2:
  150|   461k|static void idct8_w4_sse2(const __m128i *input, __m128i *output) {
  151|   461k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   461k|#define INV_COS_BIT 12
  ------------------
  152|   461k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   461k|#define INV_COS_BIT 12
  ------------------
  153|   461k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   461k|#define INV_COS_BIT 12
  ------------------
  154|       |
  155|   461k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|   461k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  156|   461k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|   461k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  157|   461k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|   461k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  158|   461k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|   461k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  159|   461k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   461k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  160|   461k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   461k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  161|   461k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   461k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  162|   461k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   461k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  163|   461k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|   461k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  164|       |
  165|       |  // stage 1
  166|   461k|  __m128i x[8];
  167|   461k|  x[0] = input[0];
  168|   461k|  x[1] = input[4];
  169|   461k|  x[2] = input[2];
  170|   461k|  x[3] = input[6];
  171|   461k|  x[4] = input[1];
  172|   461k|  x[5] = input[5];
  173|   461k|  x[6] = input[3];
  174|   461k|  x[7] = input[7];
  175|       |
  176|       |  // stage 2
  177|   461k|  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   45|   461k|  do {                                               \
  |  |   46|   461k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   461k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   461k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   461k|                                                     \
  |  |   50|   461k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   461k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   461k|                                                     \
  |  |   53|   461k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   461k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   461k|                                                     \
  |  |   56|   461k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   461k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  178|   461k|  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   45|   461k|  do {                                               \
  |  |   46|   461k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   461k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   461k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   461k|                                                     \
  |  |   50|   461k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   461k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   461k|                                                     \
  |  |   53|   461k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   461k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   461k|                                                     \
  |  |   56|   461k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   461k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  179|       |
  180|       |  // stage 3
  181|   461k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|   461k|  do {                                               \
  |  |   46|   461k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   461k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   461k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   461k|                                                     \
  |  |   50|   461k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   461k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   461k|                                                     \
  |  |   53|   461k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   461k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   461k|                                                     \
  |  |   56|   461k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   461k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  182|   461k|  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|   461k|  do {                                               \
  |  |   46|   461k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   461k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   461k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   461k|                                                     \
  |  |   50|   461k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   461k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   461k|                                                     \
  |  |   53|   461k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   461k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   461k|                                                     \
  |  |   56|   461k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   461k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  183|   461k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|   461k|  do {                                  \
  |  |   38|   461k|    const __m128i _in0 = in0;           \
  |  |   39|   461k|    const __m128i _in1 = in1;           \
  |  |   40|   461k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   461k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  184|   461k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|   461k|  do {                                  \
  |  |   46|   461k|    const __m128i _in0 = in0;           \
  |  |   47|   461k|    const __m128i _in1 = in1;           \
  |  |   48|   461k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   461k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  185|       |
  186|       |  // stage 4
  187|   461k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|   461k|  do {                                  \
  |  |   38|   461k|    const __m128i _in0 = in0;           \
  |  |   39|   461k|    const __m128i _in1 = in1;           \
  |  |   40|   461k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   461k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  188|   461k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|   461k|  do {                                  \
  |  |   38|   461k|    const __m128i _in0 = in0;           \
  |  |   39|   461k|    const __m128i _in1 = in1;           \
  |  |   40|   461k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   461k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  189|   461k|  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   45|   461k|  do {                                               \
  |  |   46|   461k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   461k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   461k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   461k|                                                     \
  |  |   50|   461k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   461k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   461k|                                                     \
  |  |   53|   461k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   461k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   461k|                                                     \
  |  |   56|   461k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   461k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  190|       |
  191|       |  // stage 5
  192|   461k|  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  ------------------
  |  |   53|   461k|  do {                                                  \
  |  |   54|   461k|    const __m128i _in0 = in0;                           \
  |  |   55|   461k|    const __m128i _in1 = in1;                           \
  |  |   56|   461k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   461k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  193|   461k|  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  ------------------
  |  |   53|   461k|  do {                                                  \
  |  |   54|   461k|    const __m128i _in0 = in0;                           \
  |  |   55|   461k|    const __m128i _in1 = in1;                           \
  |  |   56|   461k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   461k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  194|   461k|  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  ------------------
  |  |   53|   461k|  do {                                                  \
  |  |   54|   461k|    const __m128i _in0 = in0;                           \
  |  |   55|   461k|    const __m128i _in1 = in1;                           \
  |  |   56|   461k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   461k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  195|   461k|  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
  ------------------
  |  |   53|   461k|  do {                                                  \
  |  |   54|   461k|    const __m128i _in0 = in0;                           \
  |  |   55|   461k|    const __m128i _in1 = in1;                           \
  |  |   56|   461k|    out0 = _mm_adds_epi16(_in0, _in1);                  \
  |  |   57|   461k|    out1 = _mm_subs_epi16(_in0, _in1);                  \
  |  |   58|   461k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  196|   461k|}
av1_inv_txfm_ssse3.c:iadst8_w4_sse2:
 1815|   313k|static void iadst8_w4_sse2(const __m128i *input, __m128i *output) {
 1816|   313k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   313k|#define INV_COS_BIT 12
  ------------------
 1817|   313k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   313k|#define INV_COS_BIT 12
  ------------------
 1818|   313k|  const __m128i __zero = _mm_setzero_si128();
 1819|   313k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   313k|#define INV_COS_BIT 12
  ------------------
 1820|       |
 1821|   313k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1822|   313k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1823|   313k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1824|   313k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1825|   313k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1826|   313k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1827|   313k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1828|   313k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1829|   313k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1830|   313k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1831|   313k|  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1832|   313k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1833|   313k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   313k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 1834|       |
 1835|       |  // stage 1
 1836|   313k|  __m128i x[8];
 1837|   313k|  x[0] = input[7];
 1838|   313k|  x[1] = input[0];
 1839|   313k|  x[2] = input[5];
 1840|   313k|  x[3] = input[2];
 1841|   313k|  x[4] = input[3];
 1842|   313k|  x[5] = input[4];
 1843|   313k|  x[6] = input[1];
 1844|   313k|  x[7] = input[6];
 1845|       |
 1846|       |  // stage 2
 1847|   313k|  btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|   313k|  do {                                               \
  |  |   46|   313k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   313k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   313k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   313k|                                                     \
  |  |   50|   313k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   313k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   313k|                                                     \
  |  |   53|   313k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   313k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   313k|                                                     \
  |  |   56|   313k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   313k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1848|   313k|  btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|   313k|  do {                                               \
  |  |   46|   313k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   313k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   313k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   313k|                                                     \
  |  |   50|   313k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   313k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   313k|                                                     \
  |  |   53|   313k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   313k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   313k|                                                     \
  |  |   56|   313k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   313k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1849|   313k|  btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   45|   313k|  do {                                               \
  |  |   46|   313k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   313k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   313k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   313k|                                                     \
  |  |   50|   313k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   313k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   313k|                                                     \
  |  |   53|   313k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   313k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   313k|                                                     \
  |  |   56|   313k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   313k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1850|   313k|  btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|   313k|  do {                                               \
  |  |   46|   313k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   313k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   313k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   313k|                                                     \
  |  |   50|   313k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   313k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   313k|                                                     \
  |  |   53|   313k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   313k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   313k|                                                     \
  |  |   56|   313k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   313k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1851|       |
 1852|       |  // stage 3
 1853|   313k|  btf_16_adds_subs_sse2(x[0], x[4]);
  ------------------
  |  |   37|   313k|  do {                                  \
  |  |   38|   313k|    const __m128i _in0 = in0;           \
  |  |   39|   313k|    const __m128i _in1 = in1;           \
  |  |   40|   313k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   313k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1854|   313k|  btf_16_adds_subs_sse2(x[1], x[5]);
  ------------------
  |  |   37|   313k|  do {                                  \
  |  |   38|   313k|    const __m128i _in0 = in0;           \
  |  |   39|   313k|    const __m128i _in1 = in1;           \
  |  |   40|   313k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   313k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1855|   313k|  btf_16_adds_subs_sse2(x[2], x[6]);
  ------------------
  |  |   37|   313k|  do {                                  \
  |  |   38|   313k|    const __m128i _in0 = in0;           \
  |  |   39|   313k|    const __m128i _in1 = in1;           \
  |  |   40|   313k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   313k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1856|   313k|  btf_16_adds_subs_sse2(x[3], x[7]);
  ------------------
  |  |   37|   313k|  do {                                  \
  |  |   38|   313k|    const __m128i _in0 = in0;           \
  |  |   39|   313k|    const __m128i _in1 = in1;           \
  |  |   40|   313k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   313k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1857|       |
 1858|       |  // stage 4
 1859|   313k|  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   45|   313k|  do {                                               \
  |  |   46|   313k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   313k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   313k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   313k|                                                     \
  |  |   50|   313k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   313k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   313k|                                                     \
  |  |   53|   313k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   313k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   313k|                                                     \
  |  |   56|   313k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   313k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1860|   313k|  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|   313k|  do {                                               \
  |  |   46|   313k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   313k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   313k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   313k|                                                     \
  |  |   50|   313k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   313k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   313k|                                                     \
  |  |   53|   313k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   313k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   313k|                                                     \
  |  |   56|   313k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   313k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1861|       |
 1862|       |  // stage 5
 1863|   313k|  btf_16_adds_subs_sse2(x[0], x[2]);
  ------------------
  |  |   37|   313k|  do {                                  \
  |  |   38|   313k|    const __m128i _in0 = in0;           \
  |  |   39|   313k|    const __m128i _in1 = in1;           \
  |  |   40|   313k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   313k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1864|   313k|  btf_16_adds_subs_sse2(x[1], x[3]);
  ------------------
  |  |   37|   313k|  do {                                  \
  |  |   38|   313k|    const __m128i _in0 = in0;           \
  |  |   39|   313k|    const __m128i _in1 = in1;           \
  |  |   40|   313k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   313k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1865|   313k|  btf_16_adds_subs_sse2(x[4], x[6]);
  ------------------
  |  |   37|   313k|  do {                                  \
  |  |   38|   313k|    const __m128i _in0 = in0;           \
  |  |   39|   313k|    const __m128i _in1 = in1;           \
  |  |   40|   313k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   313k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1866|   313k|  btf_16_adds_subs_sse2(x[5], x[7]);
  ------------------
  |  |   37|   313k|  do {                                  \
  |  |   38|   313k|    const __m128i _in0 = in0;           \
  |  |   39|   313k|    const __m128i _in1 = in1;           \
  |  |   40|   313k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   313k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1867|       |
 1868|       |  // stage 6
 1869|   313k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|   313k|  do {                                               \
  |  |   46|   313k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   313k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   313k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   313k|                                                     \
  |  |   50|   313k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   313k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   313k|                                                     \
  |  |   53|   313k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   313k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   313k|                                                     \
  |  |   56|   313k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   313k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1870|   313k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|   313k|  do {                                               \
  |  |   46|   313k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   313k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   313k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   313k|                                                     \
  |  |   50|   313k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   313k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   313k|                                                     \
  |  |   53|   313k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   313k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   313k|                                                     \
  |  |   56|   313k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   313k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   313k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1871|       |
 1872|       |  // stage 7
 1873|   313k|  output[0] = x[0];
 1874|   313k|  output[1] = _mm_subs_epi16(__zero, x[4]);
 1875|   313k|  output[2] = x[6];
 1876|   313k|  output[3] = _mm_subs_epi16(__zero, x[2]);
 1877|   313k|  output[4] = x[3];
 1878|   313k|  output[5] = _mm_subs_epi16(__zero, x[7]);
 1879|   313k|  output[6] = x[5];
 1880|   313k|  output[7] = _mm_subs_epi16(__zero, x[1]);
 1881|   313k|}
av1_inv_txfm_ssse3.c:idct16_w4_sse2:
  387|   402k|static void idct16_w4_sse2(const __m128i *input, __m128i *output) {
  388|   402k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   402k|#define INV_COS_BIT 12
  ------------------
  389|   402k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   402k|#define INV_COS_BIT 12
  ------------------
  390|   402k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   402k|#define INV_COS_BIT 12
  ------------------
  391|       |
  392|   402k|  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  393|   402k|  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  394|   402k|  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  395|   402k|  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  396|   402k|  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  397|   402k|  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  398|   402k|  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  399|   402k|  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  400|   402k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  401|   402k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  402|   402k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  403|   402k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  404|   402k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  405|   402k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  406|   402k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  407|   402k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  408|   402k|  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  409|   402k|  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  410|   402k|  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  411|   402k|  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  ------------------
  |  |   20|   402k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
  412|       |
  413|       |  // stage 1
  414|   402k|  __m128i x[16];
  415|   402k|  x[0] = input[0];
  416|   402k|  x[1] = input[8];
  417|   402k|  x[2] = input[4];
  418|   402k|  x[3] = input[12];
  419|   402k|  x[4] = input[2];
  420|   402k|  x[5] = input[10];
  421|   402k|  x[6] = input[6];
  422|   402k|  x[7] = input[14];
  423|   402k|  x[8] = input[1];
  424|   402k|  x[9] = input[9];
  425|   402k|  x[10] = input[5];
  426|   402k|  x[11] = input[13];
  427|   402k|  x[12] = input[3];
  428|   402k|  x[13] = input[11];
  429|   402k|  x[14] = input[7];
  430|   402k|  x[15] = input[15];
  431|       |
  432|       |  // stage 2
  433|   402k|  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  434|   402k|  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  435|   402k|  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  436|   402k|  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  437|       |
  438|       |  // stage 3
  439|   402k|  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  440|   402k|  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  441|   402k|  btf_16_adds_subs_sse2(x[8], x[9]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  442|   402k|  btf_16_subs_adds_sse2(x[11], x[10]);
  ------------------
  |  |   45|   402k|  do {                                  \
  |  |   46|   402k|    const __m128i _in0 = in0;           \
  |  |   47|   402k|    const __m128i _in1 = in1;           \
  |  |   48|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  443|   402k|  btf_16_adds_subs_sse2(x[12], x[13]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  444|   402k|  btf_16_subs_adds_sse2(x[15], x[14]);
  ------------------
  |  |   45|   402k|  do {                                  \
  |  |   46|   402k|    const __m128i _in0 = in0;           \
  |  |   47|   402k|    const __m128i _in1 = in1;           \
  |  |   48|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  445|       |
  446|       |  // stage 4
  447|   402k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  448|   402k|  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  449|   402k|  btf_16_adds_subs_sse2(x[4], x[5]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  450|   402k|  btf_16_subs_adds_sse2(x[7], x[6]);
  ------------------
  |  |   45|   402k|  do {                                  \
  |  |   46|   402k|    const __m128i _in0 = in0;           \
  |  |   47|   402k|    const __m128i _in1 = in1;           \
  |  |   48|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  451|   402k|  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  452|   402k|  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  453|       |
  454|       |  // stage 5
  455|   402k|  btf_16_adds_subs_sse2(x[0], x[3]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  456|   402k|  btf_16_adds_subs_sse2(x[1], x[2]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  457|   402k|  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  458|   402k|  btf_16_adds_subs_sse2(x[8], x[11]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  459|   402k|  btf_16_adds_subs_sse2(x[9], x[10]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  460|   402k|  btf_16_subs_adds_sse2(x[15], x[12]);
  ------------------
  |  |   45|   402k|  do {                                  \
  |  |   46|   402k|    const __m128i _in0 = in0;           \
  |  |   47|   402k|    const __m128i _in1 = in1;           \
  |  |   48|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  461|   402k|  btf_16_subs_adds_sse2(x[14], x[13]);
  ------------------
  |  |   45|   402k|  do {                                  \
  |  |   46|   402k|    const __m128i _in0 = in0;           \
  |  |   47|   402k|    const __m128i _in1 = in1;           \
  |  |   48|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   49|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   50|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (50:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  462|       |
  463|       |  // stage 6
  464|   402k|  btf_16_adds_subs_sse2(x[0], x[7]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  465|   402k|  btf_16_adds_subs_sse2(x[1], x[6]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  466|   402k|  btf_16_adds_subs_sse2(x[2], x[5]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  467|   402k|  btf_16_adds_subs_sse2(x[3], x[4]);
  ------------------
  |  |   37|   402k|  do {                                  \
  |  |   38|   402k|    const __m128i _in0 = in0;           \
  |  |   39|   402k|    const __m128i _in1 = in1;           \
  |  |   40|   402k|    in0 = _mm_adds_epi16(_in0, _in1);   \
  |  |   41|   402k|    in1 = _mm_subs_epi16(_in0, _in1);   \
  |  |   42|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (42:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  468|   402k|  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  469|   402k|  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  ------------------
  |  |   45|   402k|  do {                                               \
  |  |   46|   402k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   402k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   402k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   402k|                                                     \
  |  |   50|   402k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   402k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   402k|                                                     \
  |  |   53|   402k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   402k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   402k|                                                     \
  |  |   56|   402k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   402k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   402k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  470|       |
  471|       |  // stage 7
  472|   402k|  idct16_stage7_sse2(output, x);
  473|   402k|}
av1_inv_txfm_ssse3.c:iadst16_w4_sse2:
 2117|   241k|static void iadst16_w4_sse2(const __m128i *input, __m128i *output) {
 2118|   241k|  const int8_t cos_bit = INV_COS_BIT;
  ------------------
  |  |   43|   241k|#define INV_COS_BIT 12
  ------------------
 2119|   241k|  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  ------------------
  |  |   43|   241k|#define INV_COS_BIT 12
  ------------------
 2120|   241k|  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  ------------------
  |  |   43|   241k|#define INV_COS_BIT 12
  ------------------
 2121|       |
 2122|   241k|  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2123|   241k|  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2124|   241k|  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2125|   241k|  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2126|   241k|  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2127|   241k|  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2128|   241k|  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2129|   241k|  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2130|   241k|  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2131|   241k|  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2132|   241k|  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2133|   241k|  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2134|   241k|  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2135|   241k|  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2136|   241k|  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2137|   241k|  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2138|   241k|  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2139|   241k|  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2140|   241k|  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2141|   241k|  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2142|   241k|  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2143|   241k|  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2144|   241k|  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2145|   241k|  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2146|   241k|  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2147|   241k|  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2148|   241k|  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  ------------------
  |  |   20|   241k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2149|       |
 2150|       |  // stage 1
 2151|   241k|  __m128i x[16];
 2152|   241k|  x[0] = input[15];
 2153|   241k|  x[1] = input[0];
 2154|   241k|  x[2] = input[13];
 2155|   241k|  x[3] = input[2];
 2156|   241k|  x[4] = input[11];
 2157|   241k|  x[5] = input[4];
 2158|   241k|  x[6] = input[9];
 2159|   241k|  x[7] = input[6];
 2160|   241k|  x[8] = input[7];
 2161|   241k|  x[9] = input[8];
 2162|   241k|  x[10] = input[5];
 2163|   241k|  x[11] = input[10];
 2164|   241k|  x[12] = input[3];
 2165|   241k|  x[13] = input[12];
 2166|   241k|  x[14] = input[1];
 2167|   241k|  x[15] = input[14];
 2168|       |
 2169|       |  // stage 2
 2170|   241k|  btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2171|   241k|  btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2172|   241k|  btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2173|   241k|  btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2174|   241k|  btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2175|   241k|  btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2176|   241k|  btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2177|   241k|  btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2178|       |
 2179|       |  // stage 3
 2180|   241k|  iadst16_stage3_ssse3(x);
 2181|       |
 2182|       |  // stage 4
 2183|   241k|  btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2184|   241k|  btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2185|   241k|  btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2186|   241k|  btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2187|       |
 2188|       |  // stage 5
 2189|   241k|  iadst16_stage5_ssse3(x);
 2190|       |
 2191|       |  // stage 6
 2192|   241k|  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2193|   241k|  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2194|   241k|  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2195|   241k|  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2196|       |
 2197|       |  // stage 7
 2198|   241k|  iadst16_stage7_ssse3(x);
 2199|       |
 2200|       |  // stage 8
 2201|   241k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2202|   241k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2203|   241k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2204|   241k|  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
  ------------------
  |  |   45|   241k|  do {                                               \
  |  |   46|   241k|    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
  |  |   47|   241k|    __m128i u0 = _mm_madd_epi16(t0, w0);             \
  |  |   48|   241k|    __m128i v0 = _mm_madd_epi16(t0, w1);             \
  |  |   49|   241k|                                                     \
  |  |   50|   241k|    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
  |  |   51|   241k|    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
  |  |   52|   241k|                                                     \
  |  |   53|   241k|    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
  |  |   54|   241k|    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
  |  |   55|   241k|                                                     \
  |  |   56|   241k|    out0 = _mm_packs_epi32(c0, c0);                  \
  |  |   57|   241k|    out1 = _mm_packs_epi32(d0, d0);                  \
  |  |   58|   241k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (58:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 2205|       |
 2206|       |  // stage 9
 2207|   241k|  iadst16_stage9_ssse3(output, x);
 2208|   241k|}
av1_inv_txfm_ssse3.c:iidentity16_ssse3:
 2225|  64.5k|static void iidentity16_ssse3(const __m128i *input, __m128i *output) {
 2226|  64.5k|  const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
  ------------------
  |  |   41|  64.5k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2227|  64.5k|  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
  ------------------
  |  |   41|  64.5k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2228|  1.09M|  for (int i = 0; i < 16; ++i) {
  ------------------
  |  Branch (2228:19): [True: 1.03M, False: 64.5k]
  ------------------
 2229|  1.03M|    __m128i x = _mm_mulhrs_epi16(input[i], scale);
 2230|  1.03M|    __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
 2231|  1.03M|    output[i] = _mm_adds_epi16(x, srcx2);
 2232|  1.03M|  }
 2233|  64.5k|}
av1_inv_txfm_ssse3.c:lowbd_write_buffer_4xn_sse2:
 2244|  1.24M|                                               const int height) {
 2245|  1.24M|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (2245:11): [True: 61.9k, False: 1.18M]
  ------------------
 2246|  1.24M|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (2246:20): [True: 61.9k, False: 1.18M]
  ------------------
 2247|  1.24M|  const __m128i zero = _mm_setzero_si128();
 2248|  10.9M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (2248:19): [True: 9.65M, False: 1.24M]
  ------------------
 2249|  9.65M|    const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
 2250|  9.65M|    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
 2251|  9.65M|    u = _mm_packus_epi16(u, zero);
 2252|  9.65M|    *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
 2253|  9.65M|  }
 2254|  1.24M|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_4x8_ssse3:
 2678|   399k|                                           int eob) {
 2679|   399k|  (void)tx_size_;
 2680|   399k|  (void)eob;
 2681|   399k|  __m128i buf[8];
 2682|   399k|  const TX_SIZE tx_size = TX_4X8;
 2683|   399k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2684|   399k|  const int txw_idx = get_txw_idx(tx_size);
 2685|   399k|  const int txh_idx = get_txh_idx(tx_size);
 2686|   399k|  const int txfm_size_col = tx_size_wide[tx_size];
 2687|   399k|  const int txfm_size_row = tx_size_high[tx_size];
 2688|       |
 2689|   399k|  const transform_1d_ssse3 row_txfm =
 2690|   399k|      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
 2691|   399k|  const transform_1d_ssse3 col_txfm =
 2692|   399k|      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
 2693|       |
 2694|   399k|  int ud_flip, lr_flip;
 2695|   399k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2696|   399k|  load_buffer_32bit_to_16bit(input, txfm_size_row, buf, txfm_size_col);
 2697|   399k|  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
 2698|   399k|  row_txfm(buf, buf);
 2699|       |  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
 2700|   399k|  if (lr_flip) {
  ------------------
  |  Branch (2700:7): [True: 34.4k, False: 364k]
  ------------------
 2701|  34.4k|    __m128i temp[4];
 2702|  34.4k|    flip_buf_sse2(buf, temp, txfm_size_col);
 2703|  34.4k|    transpose_16bit_8x4(temp, buf);
 2704|   364k|  } else {
 2705|   364k|    transpose_16bit_8x4(buf, buf);
 2706|   364k|  }
 2707|   399k|  col_txfm(buf, buf);
 2708|   399k|  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
 2709|   399k|  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 2710|   399k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_8x4_ssse3:
 2715|   543k|                                           int eob) {
 2716|   543k|  (void)tx_size_;
 2717|   543k|  (void)eob;
 2718|   543k|  __m128i buf[8];
 2719|   543k|  const TX_SIZE tx_size = TX_8X4;
 2720|   543k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2721|   543k|  const int txw_idx = get_txw_idx(tx_size);
 2722|   543k|  const int txh_idx = get_txh_idx(tx_size);
 2723|   543k|  const int txfm_size_col = tx_size_wide[tx_size];
 2724|   543k|  const int txfm_size_row = tx_size_high[tx_size];
 2725|       |
 2726|   543k|  const transform_1d_ssse3 row_txfm =
 2727|   543k|      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
 2728|   543k|  const transform_1d_ssse3 col_txfm =
 2729|   543k|      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
 2730|       |
 2731|   543k|  int ud_flip, lr_flip;
 2732|   543k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2733|   543k|  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
 2734|   543k|  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
 2735|   543k|  row_txfm(buf, buf);
 2736|       |  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
 2737|   543k|  if (lr_flip) {
  ------------------
  |  Branch (2737:7): [True: 41.8k, False: 501k]
  ------------------
 2738|  41.8k|    __m128i temp[8];
 2739|  41.8k|    flip_buf_sse2(buf, temp, txfm_size_col);
 2740|  41.8k|    transpose_16bit_4x8(temp, buf);
 2741|   501k|  } else {
 2742|   501k|    transpose_16bit_4x8(buf, buf);
 2743|   501k|  }
 2744|   543k|  col_txfm(buf, buf);
 2745|   543k|  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
 2746|   543k|  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 2747|   543k|}
av1_inv_txfm_ssse3.c:lowbd_write_buffer_8xn_sse2:
 2258|  1.84M|                                               const int height) {
 2259|  1.84M|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (2259:11): [True: 59.9k, False: 1.78M]
  ------------------
 2260|  1.84M|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (2260:20): [True: 59.9k, False: 1.78M]
  ------------------
 2261|  13.8M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (2261:19): [True: 12.0M, False: 1.84M]
  ------------------
 2262|  12.0M|    const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
 2263|  12.0M|    const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
 2264|  12.0M|    _mm_storel_epi64((__m128i *)(output + i * stride), u);
 2265|  12.0M|  }
 2266|  1.84M|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_4x16_ssse3:
 2752|   256k|                                            int eob) {
 2753|   256k|  (void)tx_size_;
 2754|   256k|  (void)eob;
 2755|   256k|  __m128i buf[16];
 2756|   256k|  const TX_SIZE tx_size = TX_4X16;
 2757|   256k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2758|   256k|  const int txw_idx = get_txw_idx(tx_size);
 2759|   256k|  const int txh_idx = get_txh_idx(tx_size);
 2760|   256k|  const int txfm_size_col = tx_size_wide[tx_size];
 2761|   256k|  const int txfm_size_row = tx_size_high[tx_size];
 2762|       |
 2763|   256k|  const transform_1d_ssse3 row_txfm =
 2764|   256k|      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
 2765|   256k|  const transform_1d_ssse3 col_txfm =
 2766|   256k|      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
 2767|       |
 2768|   256k|  int ud_flip, lr_flip;
 2769|   256k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2770|       |
 2771|   256k|  const int row_one_loop = 8;
 2772|   770k|  for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (2772:19): [True: 513k, False: 256k]
  ------------------
 2773|   513k|    const int32_t *input_cur = input + i * row_one_loop;
 2774|   513k|    __m128i *buf_cur = buf + i * row_one_loop;
 2775|   513k|    load_buffer_32bit_to_16bit(input_cur, txfm_size_row, buf_cur,
 2776|   513k|                               txfm_size_col);
 2777|   513k|    if (row_txfm == iidentity4_ssse3) {
  ------------------
  |  Branch (2777:9): [True: 98.4k, False: 415k]
  ------------------
 2778|  98.4k|      const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
  ------------------
  |  |   20|  98.4k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2779|  98.4k|      const __m128i ones = _mm_set1_epi16(1);
 2780|   492k|      for (int j = 0; j < 4; ++j) {
  ------------------
  |  Branch (2780:23): [True: 393k, False: 98.4k]
  ------------------
 2781|   393k|        const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
 2782|   393k|        const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
 2783|   393k|        const __m128i buf_32_lo =
 2784|   393k|            _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
  ------------------
  |  |   41|   393k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2785|   393k|        const __m128i buf_32_hi =
 2786|   393k|            _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
  ------------------
  |  |   41|   393k|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2787|   393k|        buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
 2788|   393k|      }
 2789|   415k|    } else {
 2790|   415k|      row_txfm(buf_cur, buf_cur);
 2791|   415k|      round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
 2792|   415k|    }
 2793|   513k|    if (lr_flip) {
  ------------------
  |  Branch (2793:9): [True: 21.8k, False: 491k]
  ------------------
 2794|  21.8k|      __m128i temp[8];
 2795|  21.8k|      flip_buf_sse2(buf_cur, temp, txfm_size_col);
 2796|  21.8k|      transpose_16bit_8x4(temp, buf_cur);
 2797|   491k|    } else {
 2798|   491k|      transpose_16bit_8x4(buf_cur, buf_cur);
 2799|   491k|    }
 2800|   513k|  }
 2801|   256k|  col_txfm(buf, buf);
 2802|   256k|  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
 2803|   256k|  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 2804|   256k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_16x4_ssse3:
 2809|   515k|                                            int eob) {
 2810|   515k|  (void)tx_size_;
 2811|   515k|  (void)eob;
 2812|   515k|  __m128i buf[16];
 2813|   515k|  const TX_SIZE tx_size = TX_16X4;
 2814|   515k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2815|   515k|  const int txw_idx = get_txw_idx(tx_size);
 2816|   515k|  const int txh_idx = get_txh_idx(tx_size);
 2817|   515k|  const int txfm_size_col = tx_size_wide[tx_size];
 2818|   515k|  const int txfm_size_row = tx_size_high[tx_size];
 2819|   515k|  const int buf_size_w_div8 = txfm_size_col >> 3;
 2820|       |
 2821|   515k|  const transform_1d_ssse3 row_txfm =
 2822|   515k|      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
 2823|   515k|  const transform_1d_ssse3 col_txfm =
 2824|   515k|      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
 2825|       |
 2826|   515k|  int ud_flip, lr_flip;
 2827|   515k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2828|   515k|  const int row_one_loop = 8;
 2829|   515k|  load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
 2830|   515k|  if (row_txfm == iidentity16_ssse3) {
  ------------------
  |  Branch (2830:7): [True: 63.3k, False: 451k]
  ------------------
 2831|  63.3k|    const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
  ------------------
  |  |   20|  63.3k|  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)))
  ------------------
 2832|  63.3k|    const __m128i ones = _mm_set1_epi16(1);
 2833|  1.07M|    for (int j = 0; j < 16; ++j) {
  ------------------
  |  Branch (2833:21): [True: 1.01M, False: 63.3k]
  ------------------
 2834|  1.01M|      const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
 2835|  1.01M|      const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
 2836|  1.01M|      const __m128i buf_32_lo =
 2837|  1.01M|          _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
  ------------------
  |  |   41|  1.01M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2838|  1.01M|      const __m128i buf_32_hi =
 2839|  1.01M|          _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
  ------------------
  |  |   41|  1.01M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 2840|  1.01M|      buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
 2841|  1.01M|    }
 2842|   451k|  } else {
 2843|   451k|    row_txfm(buf, buf);
 2844|   451k|    round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
 2845|   451k|  }
 2846|   515k|  if (lr_flip) {
  ------------------
  |  Branch (2846:7): [True: 12.2k, False: 503k]
  ------------------
 2847|  12.2k|    __m128i temp[16];
 2848|  12.2k|    flip_buf_sse2(buf, temp, 16);
 2849|  12.2k|    transpose_16bit_4x8(temp, buf);
 2850|  12.2k|    transpose_16bit_4x8(temp + 8, buf + 8);
 2851|   503k|  } else {
 2852|   503k|    transpose_16bit_4x8(buf, buf);
 2853|   503k|    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
 2854|   503k|  }
 2855|  1.54M|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (2855:19): [True: 1.03M, False: 515k]
  ------------------
 2856|  1.03M|    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop);
 2857|  1.03M|    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
 2858|  1.03M|  }
 2859|   515k|  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
 2860|   515k|  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
 2861|   515k|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_universe_ssse3:
 2647|  1.18M|    TX_SIZE tx_size, int eob) {
 2648|  1.18M|  switch (tx_type) {
 2649|   725k|    case DCT_DCT:
  ------------------
  |  Branch (2649:5): [True: 725k, False: 459k]
  ------------------
 2650|   725k|      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
 2651|   725k|                                             tx_size, eob);
 2652|   725k|      break;
 2653|  86.4k|    case IDTX:
  ------------------
  |  Branch (2653:5): [True: 86.4k, False: 1.09M]
  ------------------
 2654|  86.4k|      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
 2655|  86.4k|      break;
 2656|  15.7k|    case V_DCT:
  ------------------
  |  Branch (2656:5): [True: 15.7k, False: 1.16M]
  ------------------
 2657|  20.1k|    case V_ADST:
  ------------------
  |  Branch (2657:5): [True: 4.35k, False: 1.18M]
  ------------------
 2658|  24.9k|    case V_FLIPADST:
  ------------------
  |  Branch (2658:5): [True: 4.82k, False: 1.18M]
  ------------------
 2659|  24.9k|      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
 2660|  24.9k|                                                tx_size, eob);
 2661|  24.9k|      break;
 2662|  42.9k|    case H_DCT:
  ------------------
  |  Branch (2662:5): [True: 42.9k, False: 1.14M]
  ------------------
 2663|  50.4k|    case H_ADST:
  ------------------
  |  Branch (2663:5): [True: 7.52k, False: 1.17M]
  ------------------
 2664|  55.9k|    case H_FLIPADST:
  ------------------
  |  Branch (2664:5): [True: 5.47k, False: 1.17M]
  ------------------
 2665|  55.9k|      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
 2666|  55.9k|                                                tx_size, eob);
 2667|  55.9k|      break;
 2668|   292k|    default:
  ------------------
  |  Branch (2668:5): [True: 292k, False: 892k]
  ------------------
 2669|   292k|      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
 2670|   292k|                                             tx_size, eob);
 2671|   292k|      break;
 2672|  1.18M|  }
 2673|  1.18M|}
av1_inv_txfm_ssse3.c:lowbd_inv_txfm2d_add_no_identity_ssse3:
 2476|  1.01M|    TX_SIZE tx_size, int eob) {
 2477|  1.01M|  __m128i buf1[64 * 8];
 2478|  1.01M|  int eobx, eoby;
 2479|  1.01M|  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
 2480|  1.01M|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 2481|  1.01M|  const int txw_idx = get_txw_idx(tx_size);
 2482|  1.01M|  const int txh_idx = get_txh_idx(tx_size);
 2483|  1.01M|  const int txfm_size_col = tx_size_wide[tx_size];
 2484|  1.01M|  const int txfm_size_row = tx_size_high[tx_size];
 2485|  1.01M|  const int buf_size_w_div8 = txfm_size_col >> 3;
 2486|  1.01M|  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
 2487|  1.01M|  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
 2488|  1.01M|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  1.01M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 1.01M]
  |  |  ------------------
  ------------------
 2489|  1.01M|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 2490|       |
 2491|  1.01M|  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 2492|  1.01M|  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 2493|  1.01M|  const transform_1d_ssse3 row_txfm =
 2494|  1.01M|      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 2495|  1.01M|  const transform_1d_ssse3 col_txfm =
 2496|  1.01M|      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 2497|       |
 2498|  1.01M|  assert(col_txfm != NULL);
 2499|  1.01M|  assert(row_txfm != NULL);
 2500|  1.01M|  int ud_flip, lr_flip;
 2501|  1.01M|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 2502|  2.09M|  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
  ------------------
  |  Branch (2502:19): [True: 1.07M, False: 1.01M]
  ------------------
 2503|  1.07M|    __m128i buf0[64];
 2504|  1.07M|    load_buffer_32bit_to_16bit(input + 8 * i, input_stride, buf0,
 2505|  1.07M|                               buf_size_nonzero_w);
 2506|  1.07M|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (2506:9): [True: 351k, False: 725k]
  |  Branch (2506:27): [True: 223k, False: 501k]
  ------------------
 2507|   575k|      round_shift_ssse3(buf0, buf0, buf_size_nonzero_w);  // rect special code
 2508|   575k|    }
 2509|  1.07M|    row_txfm(buf0, buf0);
 2510|  1.07M|    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
 2511|  1.07M|    __m128i *_buf1 = buf1 + i * 8;
 2512|  1.07M|    if (lr_flip) {
  ------------------
  |  Branch (2512:9): [True: 38.8k, False: 1.03M]
  ------------------
 2513|  99.0k|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2513:23): [True: 60.2k, False: 38.8k]
  ------------------
 2514|  60.2k|        __m128i temp[8];
 2515|  60.2k|        flip_buf_sse2(buf0 + 8 * j, temp, 8);
 2516|  60.2k|        transpose_16bit_8x8(temp,
 2517|  60.2k|                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
 2518|  60.2k|      }
 2519|  1.03M|    } else {
 2520|  3.57M|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (2520:23): [True: 2.53M, False: 1.03M]
  ------------------
 2521|  2.53M|        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
 2522|  2.53M|      }
 2523|  1.03M|    }
 2524|  1.07M|  }
 2525|  3.55M|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (2525:19): [True: 2.53M, False: 1.01M]
  ------------------
 2526|  2.53M|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row);
 2527|  2.53M|    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
 2528|  2.53M|  }
 2529|       |
 2530|  1.01M|  if (txfm_size_col >= 16) {
  ------------------
  |  Branch (2530:7): [True: 741k, False: 276k]
  ------------------
 2531|  1.87M|    for (int i = 0; i < (txfm_size_col >> 4); i++) {
  ------------------
  |  Branch (2531:21): [True: 1.13M, False: 741k]
  ------------------
 2532|  1.13M|      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
 2533|  1.13M|                                   output + 16 * i, stride, ud_flip,
 2534|  1.13M|                                   txfm_size_row);
 2535|  1.13M|    }
 2536|   741k|  } else if (txfm_size_col == 8) {
  ------------------
  |  Branch (2536:14): [True: 276k, False: 61]
  ------------------
 2537|   276k|    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
 2538|   276k|  }
 2539|  1.01M|}
av1_inv_txfm_ssse3.c:lowbd_write_buffer_16xn_sse2:
 2456|  1.13M|                                                int height) {
 2457|  1.13M|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (2457:11): [True: 14.5k, False: 1.11M]
  ------------------
 2458|  1.13M|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (2458:20): [True: 14.5k, False: 1.11M]
  ------------------
 2459|  10.1M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (2459:19): [True: 9.05M, False: 1.13M]
  ------------------
 2460|  9.05M|    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
 2461|  9.05M|    __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
 2462|  9.05M|    _mm_storeu_si128((__m128i *)(output + i * stride), u);
 2463|  9.05M|  }
 2464|  1.13M|}
av1_inv_txfm_ssse3.c:lowbd_get_recon_16x16_sse2:
 2445|  9.05M|                                                 __m128i res0, __m128i res1) {
 2446|  9.05M|  const __m128i zero = _mm_setzero_si128();
 2447|  9.05M|  __m128i x0 = _mm_unpacklo_epi8(pred, zero);
 2448|  9.05M|  __m128i x1 = _mm_unpackhi_epi8(pred, zero);
 2449|  9.05M|  x0 = _mm_adds_epi16(res0, x0);
 2450|  9.05M|  x1 = _mm_adds_epi16(res1, x1);
 2451|  9.05M|  return _mm_packus_epi16(x0, x1);
 2452|  9.05M|}

av1_inv_txfm_ssse3.c:get_eobx_eoby_scan_h_identity:
  202|  56.4k|                                                 TX_SIZE tx_size, int eob) {
  203|  56.4k|  eob -= 1;
  204|  56.4k|  const int txfm_size_col = tx_size_wide[tx_size];
  205|  56.4k|  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
  ------------------
  |  |   34|  56.4k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 56.4k]
  |  |  ------------------
  ------------------
  206|  56.4k|  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
  ------------------
  |  Branch (206:11): [True: 39.4k, False: 16.9k]
  ------------------
  207|  56.4k|  const int temp_eoby = eob / (eobx_max + 1);
  208|  56.4k|  assert(temp_eoby < 32);
  209|  56.4k|  *eoby = eob_fill[temp_eoby];
  210|  56.4k|}
av1_inv_txfm_ssse3.c:get_eobx_eoby_scan_v_identity:
  213|   123k|                                                 TX_SIZE tx_size, int eob) {
  214|   123k|  eob -= 1;
  215|   123k|  const int txfm_size_row = tx_size_high[tx_size];
  216|   123k|  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
  ------------------
  |  |   34|   123k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 123k]
  |  |  ------------------
  ------------------
  217|   123k|  *eobx = eob_fill[eob / (eoby_max + 1)];
  218|   123k|  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
  ------------------
  |  Branch (218:11): [True: 81.0k, False: 42.5k]
  ------------------
  219|   123k|}
av1_inv_txfm_ssse3.c:round_shift_16bit_ssse3:
   60|  7.43M|static inline void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
   61|  7.43M|  if (bit < 0) {
  ------------------
  |  Branch (61:7): [True: 7.43M, False: 18.4E]
  ------------------
   62|  7.43M|    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
   63|  79.9M|    for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (63:21): [True: 72.5M, False: 7.43M]
  ------------------
   64|  72.5M|      in[i] = _mm_mulhrs_epi16(in[i], scale);
   65|  72.5M|    }
   66|  18.4E|  } else if (bit > 0) {
  ------------------
  |  Branch (66:14): [True: 0, False: 18.4E]
  ------------------
   67|      0|    for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (67:21): [True: 0, False: 0]
  ------------------
   68|      0|      in[i] = _mm_slli_epi16(in[i], bit);
   69|      0|    }
   70|      0|  }
   71|  7.43M|}
av1_inv_txfm_ssse3.c:get_eobx_eoby_scan_default:
  182|  1.01M|                                              TX_SIZE tx_size, int eob) {
  183|  1.01M|  if (eob == 1) {
  ------------------
  |  Branch (183:7): [True: 324k, False: 693k]
  ------------------
  184|   324k|    *eobx = 0;
  185|   324k|    *eoby = 0;
  186|   324k|    return;
  187|   324k|  }
  188|       |
  189|   693k|  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
  190|   693k|  const int eob_row = (eob - 1) >> tx_w_log2;
  191|   693k|  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
  192|   693k|  *eobx = eobxy & 0xFF;
  193|   693k|  *eoby = eobxy >> 8;
  194|   693k|}
highbd_inv_txfm_sse4.c:get_eobx_eoby_scan_v_identity:
  213|  58.7k|                                                 TX_SIZE tx_size, int eob) {
  214|  58.7k|  eob -= 1;
  215|  58.7k|  const int txfm_size_row = tx_size_high[tx_size];
  216|  58.7k|  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
  ------------------
  |  |   34|  58.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 58.7k]
  |  |  ------------------
  ------------------
  217|  58.7k|  *eobx = eob_fill[eob / (eoby_max + 1)];
  218|  58.7k|  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
  ------------------
  |  Branch (218:11): [True: 39.0k, False: 19.7k]
  ------------------
  219|  58.7k|}
highbd_inv_txfm_sse4.c:get_eobx_eoby_scan_h_identity:
  202|   177k|                                                 TX_SIZE tx_size, int eob) {
  203|   177k|  eob -= 1;
  204|   177k|  const int txfm_size_col = tx_size_wide[tx_size];
  205|   177k|  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
  ------------------
  |  |   34|   177k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 177k]
  |  |  ------------------
  ------------------
  206|   177k|  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
  ------------------
  |  Branch (206:11): [True: 104k, False: 73.0k]
  ------------------
  207|   177k|  const int temp_eoby = eob / (eobx_max + 1);
  208|   177k|  assert(temp_eoby < 32);
  209|   177k|  *eoby = eob_fill[temp_eoby];
  210|   177k|}
av1_inv_txfm_avx2.c:get_eobx_eoby_scan_default:
  182|  1.51M|                                              TX_SIZE tx_size, int eob) {
  183|  1.51M|  if (eob == 1) {
  ------------------
  |  Branch (183:7): [True: 482k, False: 1.03M]
  ------------------
  184|   482k|    *eobx = 0;
  185|   482k|    *eoby = 0;
  186|   482k|    return;
  187|   482k|  }
  188|       |
  189|  1.03M|  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
  190|  1.03M|  const int eob_row = (eob - 1) >> tx_w_log2;
  191|  1.03M|  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
  192|  1.03M|  *eobx = eobxy & 0xFF;
  193|  1.03M|  *eoby = eobxy >> 8;
  194|  1.03M|}
av1_inv_txfm_avx2.c:get_eobx_eoby_scan_h_identity:
  202|  5.30k|                                                 TX_SIZE tx_size, int eob) {
  203|  5.30k|  eob -= 1;
  204|  5.30k|  const int txfm_size_col = tx_size_wide[tx_size];
  205|  5.30k|  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
  ------------------
  |  |   34|  5.30k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 5.30k]
  |  |  ------------------
  ------------------
  206|  5.30k|  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
  ------------------
  |  Branch (206:11): [True: 2.51k, False: 2.78k]
  ------------------
  207|  5.30k|  const int temp_eoby = eob / (eobx_max + 1);
  208|  5.30k|  assert(temp_eoby < 32);
  209|  5.30k|  *eoby = eob_fill[temp_eoby];
  210|  5.30k|}
av1_inv_txfm_avx2.c:get_eobx_eoby_scan_v_identity:
  213|  18.3k|                                                 TX_SIZE tx_size, int eob) {
  214|  18.3k|  eob -= 1;
  215|  18.3k|  const int txfm_size_row = tx_size_high[tx_size];
  216|  18.3k|  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
  ------------------
  |  |   34|  18.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 18.3k]
  |  |  ------------------
  ------------------
  217|  18.3k|  *eobx = eob_fill[eob / (eoby_max + 1)];
  218|  18.3k|  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
  ------------------
  |  Branch (218:11): [True: 10.0k, False: 8.31k]
  ------------------
  219|  18.3k|}
highbd_inv_txfm_avx2.c:get_eobx_eoby_scan_default:
  182|  4.56M|                                              TX_SIZE tx_size, int eob) {
  183|  4.56M|  if (eob == 1) {
  ------------------
  |  Branch (183:7): [True: 1.53M, False: 3.03M]
  ------------------
  184|  1.53M|    *eobx = 0;
  185|  1.53M|    *eoby = 0;
  186|  1.53M|    return;
  187|  1.53M|  }
  188|       |
  189|  3.03M|  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
  190|  3.03M|  const int eob_row = (eob - 1) >> tx_w_log2;
  191|  3.03M|  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
  192|  3.03M|  *eobx = eobxy & 0xFF;
  193|  3.03M|  *eoby = eobxy >> 8;
  194|  3.03M|}

av1_inv_txfm_ssse3.c:load_32bit_to_16bit:
   87|  17.2M|static inline __m128i load_32bit_to_16bit(const int32_t *a) {
   88|  17.2M|  const __m128i a_low = _mm_load_si128((const __m128i *)a);
   89|  17.2M|  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
   90|  17.2M|}
av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit:
  173|  2.12M|                                              __m128i *out, int out_size) {
  174|  16.5M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (174:19): [True: 14.3M, False: 2.12M]
  ------------------
  175|  14.3M|    out[i] = load_32bit_to_16bit(in + i * stride);
  176|  14.3M|  }
  177|  2.12M|}
av1_inv_txfm_ssse3.c:flip_buf_sse2:
  253|   222k|static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
  254|  1.72M|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (254:19): [True: 1.50M, False: 222k]
  ------------------
  255|  1.50M|    out[size - i - 1] = in[i];
  256|  1.50M|  }
  257|   222k|}
av1_inv_txfm_ssse3.c:load_buffer_32bit_to_16bit_w4:
  180|  1.64M|                                                 __m128i *out, int out_size) {
  181|  16.5M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (181:19): [True: 14.9M, False: 1.64M]
  ------------------
  182|  14.9M|    out[i] = load_32bit_to_16bit_w4(in + i * stride);
  183|  14.9M|  }
  184|  1.64M|}
av1_inv_txfm_ssse3.c:load_32bit_to_16bit_w4:
   92|  14.9M|static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) {
   93|  14.9M|  const __m128i a_low = _mm_load_si128((const __m128i *)a);
   94|  14.9M|  return _mm_packs_epi32(a_low, a_low);
   95|  14.9M|}
highbd_inv_txfm_sse4.c:flip_buf_sse2:
  253|   141k|static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
  254|  1.65M|  for (int i = 0; i < size; ++i) {
  ------------------
  |  Branch (254:19): [True: 1.51M, False: 141k]
  ------------------
  255|  1.51M|    out[size - i - 1] = in[i];
  256|  1.51M|  }
  257|   141k|}

highbd_inv_txfm_sse4.c:av1_round_shift_rect_array_32_sse4_1:
   49|  1.93M|                                                        const int val) {
   50|  1.93M|  const __m128i sqrt2 = _mm_set1_epi32(val);
   51|  1.93M|  if (bit > 0) {
  ------------------
  |  Branch (51:7): [True: 0, False: 1.93M]
  ------------------
   52|      0|    int i;
   53|      0|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (53:17): [True: 0, False: 0]
  ------------------
   54|      0|      const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit);
   55|      0|      const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
   56|      0|      output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
  ------------------
  |  |   41|      0|#define NewSqrt2Bits ((int32_t)12)
  ------------------
   57|      0|    }
   58|  1.93M|  } else {
   59|  1.93M|    int i;
   60|  19.3M|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (60:17): [True: 17.3M, False: 1.93M]
  ------------------
   61|  17.3M|      const __m128i r0 = _mm_slli_epi32(input[i], -bit);
   62|  17.3M|      const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
   63|  17.3M|      output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
  ------------------
  |  |   41|  17.3M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
   64|  17.3M|    }
   65|  1.93M|  }
   66|  1.93M|}
highbd_inv_txfm_sse4.c:av1_round_shift_32_sse4_1:
   21|  65.9M|static inline __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
   22|  65.9M|  __m128i tmp, round;
   23|  65.9M|  round = _mm_set1_epi32(1 << (bit - 1));
   24|  65.9M|  tmp = _mm_add_epi32(vec, round);
   25|  65.9M|  return _mm_srai_epi32(tmp, bit);
   26|  65.9M|}
highbd_inv_txfm_sse4.c:av1_round_shift_array_32_sse4_1:
   31|  4.28M|                                                   const int bit) {
   32|  4.28M|  if (bit > 0) {
  ------------------
  |  Branch (32:7): [True: 4.28M, False: 18.4E]
  ------------------
   33|  4.28M|    int i;
   34|  52.8M|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (34:17): [True: 48.5M, False: 4.28M]
  ------------------
   35|  48.5M|      output[i] = av1_round_shift_32_sse4_1(input[i], bit);
   36|  48.5M|    }
   37|  18.4E|  } else {
   38|  18.4E|    int i;
   39|  18.4E|    for (i = 0; i < size; i++) {
  ------------------
  |  Branch (39:17): [True: 0, False: 18.4E]
  ------------------
   40|      0|      output[i] = _mm_slli_epi32(input[i], -bit);
   41|      0|    }
   42|  18.4E|  }
   43|  4.28M|}

cdef_find_dir_dual_avx2:
  182|  13.3M|                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  183|  13.3M|  int32_t cost_first_8x8[8];
  184|  13.3M|  int32_t cost_second_8x8[8];
  185|       |  // Used to store the best cost for 2 8x8's.
  186|  13.3M|  int32_t best_cost[2] = { 0 };
  187|       |  // Best direction for 2 8x8's.
  188|  13.3M|  int best_dir[2] = { 0 };
  189|       |
  190|  13.3M|  const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
  191|  13.3M|  const __m256i const_128_reg = _mm256_set1_epi16(128);
  192|  13.3M|  __m256i lines[8];
  193|   120M|  for (int i = 0; i < 8; i++) {
  ------------------
  |  Branch (193:19): [True: 106M, False: 13.3M]
  ------------------
  194|   106M|    const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
  195|   106M|    const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);
  196|       |
  197|   106M|    lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
  198|   106M|    lines[i] = _mm256_sub_epi16(
  199|   106M|        _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
  200|   106M|  }
  201|       |
  202|       |  /* Compute "mostly vertical" directions. */
  203|  13.3M|  const __m256i dir47 =
  204|  13.3M|      compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);
  205|       |
  206|       |  /* Transpose and reverse the order of the lines. */
  207|  13.3M|  array_reverse_transpose_8x8_avx2(lines, lines);
  208|       |
  209|       |  /* Compute "mostly horizontal" directions. */
  210|  13.3M|  const __m256i dir03 =
  211|  13.3M|      compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);
  212|       |
  213|  13.3M|  __m256i max = _mm256_max_epi32(dir03, dir47);
  214|  13.3M|  max =
  215|  13.3M|      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
  216|  13.3M|                                            _mm256_slli_si256(max, 16 - (8))));
  217|  13.3M|  max =
  218|  13.3M|      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
  219|  13.3M|                                            _mm256_slli_si256(max, 16 - (4))));
  220|       |
  221|  13.3M|  const __m128i first_8x8_output = _mm256_castsi256_si128(max);
  222|  13.3M|  const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
  223|  13.3M|  const __m128i cmpeg_res_00 =
  224|  13.3M|      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
  225|  13.3M|  const __m128i cmpeg_res_01 =
  226|  13.3M|      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
  227|  13.3M|  const __m128i cmpeg_res_10 =
  228|  13.3M|      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
  229|  13.3M|  const __m128i cmpeg_res_11 =
  230|  13.3M|      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
  231|  13.3M|  const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00);
  232|  13.3M|  const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10);
  233|       |
  234|  13.3M|  best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
  235|  13.3M|  best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
  236|  13.3M|  best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
  237|  13.3M|  best_dir[0] =
  238|  13.3M|      get_msb(best_dir[0] ^ (best_dir[0] - 1));  // Count trailing zeros
  239|  13.3M|  best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
  240|  13.3M|  best_dir[1] =
  241|  13.3M|      get_msb(best_dir[1] ^ (best_dir[1] - 1));  // Count trailing zeros
  242|       |
  243|       |  /* Difference between the optimal variance and the variance along the
  244|       |     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  245|  13.3M|  *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
  246|  13.3M|  *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];
  247|       |
  248|       |  /* We'd normally divide by 840, but dividing by 1024 is close enough
  249|       |  for what we're going to do with this. */
  250|  13.3M|  *var_out_1st >>= 10;
  251|  13.3M|  *var_out_2nd >>= 10;
  252|  13.3M|  *out_dir_1st_8x8 = best_dir[0];
  253|  13.3M|  *out_dir_2nd_8x8 = best_dir[1];
  254|  13.3M|}
cdef_copy_rect8_8bit_to_16bit_avx2:
  258|   868k|                                        int width, int height) {
  259|   868k|  int j = 0;
  260|   868k|  int remaining_width = width;
  261|   868k|  assert(height % 2 == 0);
  262|   869k|  assert(height > 0);
  263|   869k|  assert(width > 0);
  264|       |
  265|       |  // Process multiple 32 pixels at a time.
  266|   869k|  if (remaining_width > 31) {
  ------------------
  |  Branch (266:7): [True: 850k, False: 18.8k]
  ------------------
  267|   850k|    int i = 0;
  268|  14.0M|    do {
  269|  14.0M|      j = 0;
  270|  23.0M|      do {
  271|  23.0M|        __m128i row00 =
  272|  23.0M|            _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
  273|  23.0M|        __m128i row01 = _mm_loadu_si128(
  274|  23.0M|            (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
  275|  23.0M|        __m128i row10 =
  276|  23.0M|            _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
  277|  23.0M|        __m128i row11 = _mm_loadu_si128(
  278|  23.0M|            (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
  279|  23.0M|        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
  280|  23.0M|                            _mm256_cvtepu8_epi16(row00));
  281|  23.0M|        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
  282|  23.0M|                            _mm256_cvtepu8_epi16(row01));
  283|  23.0M|        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
  284|  23.0M|                            _mm256_cvtepu8_epi16(row10));
  285|  23.0M|        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
  286|  23.0M|                            _mm256_cvtepu8_epi16(row11));
  287|  23.0M|        j += 32;
  288|  23.0M|      } while (j <= width - 32);
  ------------------
  |  Branch (288:16): [True: 8.99M, False: 14.0M]
  ------------------
  289|  14.0M|      i += 2;
  290|  14.0M|    } while (i < height);
  ------------------
  |  Branch (290:14): [True: 13.1M, False: 850k]
  ------------------
  291|   850k|    remaining_width = width & 31;
  292|   850k|  }
  293|       |
  294|       |  // Process 16 pixels at a time.
  295|   869k|  if (remaining_width > 15) {
  ------------------
  |  Branch (295:7): [True: 47.5k, False: 821k]
  ------------------
  296|  47.5k|    int i = 0;
  297|   444k|    do {
  298|   444k|      __m128i row0 =
  299|   444k|          _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
  300|   444k|      __m128i row1 =
  301|   444k|          _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
  302|   444k|      _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
  303|   444k|                          _mm256_cvtepu8_epi16(row0));
  304|   444k|      _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
  305|   444k|                          _mm256_cvtepu8_epi16(row1));
  306|   444k|      i += 2;
  307|   444k|    } while (i < height);
  ------------------
  |  Branch (307:14): [True: 396k, False: 47.5k]
  ------------------
  308|  47.5k|    remaining_width = width & 15;
  309|  47.5k|    j += 16;
  310|  47.5k|  }
  311|       |
  312|       |  // Process 8 pixels at a time.
  313|   869k|  if (remaining_width > 7) {
  ------------------
  |  Branch (313:7): [True: 640k, False: 228k]
  ------------------
  314|   640k|    int i = 0;
  315|  13.2M|    do {
  316|  13.2M|      __m128i row0 =
  317|  13.2M|          _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
  318|  13.2M|      __m128i row1 =
  319|  13.2M|          _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
  320|  13.2M|      _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
  321|  13.2M|                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
  322|  13.2M|      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
  323|  13.2M|                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
  324|  13.2M|      i += 2;
  325|  13.2M|    } while (i < height);
  ------------------
  |  Branch (325:14): [True: 12.5M, False: 640k]
  ------------------
  326|   640k|    remaining_width = width & 7;
  327|   640k|    j += 8;
  328|   640k|  }
  329|       |
  330|       |  // Process 4 pixels at a time.
  331|   869k|  if (remaining_width > 3) {
  ------------------
  |  Branch (331:7): [True: 15.6k, False: 853k]
  ------------------
  332|  15.6k|    int i = 0;
  333|   190k|    do {
  334|   190k|      __m128i row0 =
  335|   190k|          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
  336|   190k|      __m128i row1 =
  337|   190k|          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
  338|   190k|      _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
  339|   190k|                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
  340|   190k|      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
  341|   190k|                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
  342|   190k|      i += 2;
  343|   190k|    } while (i < height);
  ------------------
  |  Branch (343:14): [True: 175k, False: 15.6k]
  ------------------
  344|  15.6k|    remaining_width = width & 3;
  345|  15.6k|    j += 4;
  346|  15.6k|  }
  347|       |
  348|       |  // Process the remaining pixels.
  349|   869k|  if (remaining_width) {
  ------------------
  |  Branch (349:7): [True: 0, False: 869k]
  ------------------
  350|      0|    for (int i = 0; i < height; i++) {
  ------------------
  |  Branch (350:21): [True: 0, False: 0]
  ------------------
  351|      0|      for (int k = j; k < width; k++) {
  ------------------
  |  Branch (351:23): [True: 0, False: 0]
  ------------------
  352|      0|        dst[i * dstride + k] = src[i * sstride + k];
  353|      0|      }
  354|      0|    }
  355|      0|  }
  356|   869k|}
cdef_block_avx2.c:compute_directions_avx2:
   70|  26.8M|                                              int32_t cost_second_8x8[4]) {
   71|  26.8M|  __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
   72|  26.8M|  __m256i partial6;
   73|  26.8M|  __m256i tmp;
   74|       |  /* Partial sums for lines 0 and 1. */
   75|  26.8M|  partial4a = _mm256_slli_si256(lines[0], 14);
   76|  26.8M|  partial4b = _mm256_srli_si256(lines[0], 2);
   77|  26.8M|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
   78|  26.8M|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
   79|  26.8M|  tmp = _mm256_add_epi16(lines[0], lines[1]);
   80|  26.8M|  partial5a = _mm256_slli_si256(tmp, 10);
   81|  26.8M|  partial5b = _mm256_srli_si256(tmp, 6);
   82|  26.8M|  partial7a = _mm256_slli_si256(tmp, 4);
   83|  26.8M|  partial7b = _mm256_srli_si256(tmp, 12);
   84|  26.8M|  partial6 = tmp;
   85|       |
   86|       |  /* Partial sums for lines 2 and 3. */
   87|  26.8M|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
   88|  26.8M|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
   89|  26.8M|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
   90|  26.8M|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
   91|  26.8M|  tmp = _mm256_add_epi16(lines[2], lines[3]);
   92|  26.8M|  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
   93|  26.8M|  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
   94|  26.8M|  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
   95|  26.8M|  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
   96|  26.8M|  partial6 = _mm256_add_epi16(partial6, tmp);
   97|       |
   98|       |  /* Partial sums for lines 4 and 5. */
   99|  26.8M|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
  100|  26.8M|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
  101|  26.8M|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
  102|  26.8M|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
  103|  26.8M|  tmp = _mm256_add_epi16(lines[4], lines[5]);
  104|  26.8M|  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
  105|  26.8M|  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
  106|  26.8M|  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
  107|  26.8M|  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
  108|  26.8M|  partial6 = _mm256_add_epi16(partial6, tmp);
  109|       |
  110|       |  /* Partial sums for lines 6 and 7. */
  111|  26.8M|  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
  112|  26.8M|  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
  113|  26.8M|  partial4a = _mm256_add_epi16(partial4a, lines[7]);
  114|  26.8M|  tmp = _mm256_add_epi16(lines[6], lines[7]);
  115|  26.8M|  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
  116|  26.8M|  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
  117|  26.8M|  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
  118|  26.8M|  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
  119|  26.8M|  partial6 = _mm256_add_epi16(partial6, tmp);
  120|       |
  121|  26.8M|  const __m256i const_reg_1 =
  122|  26.8M|      _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
  123|  26.8M|  const __m256i const_reg_2 =
  124|  26.8M|      _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
  125|  26.8M|  const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
  126|  26.8M|  const __m256i const_reg_4 =
  127|  26.8M|      _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
  128|       |
  129|       |  /* Compute costs in terms of partial sums. */
  130|  26.8M|  partial4a =
  131|  26.8M|      fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
  132|  26.8M|  partial7a =
  133|  26.8M|      fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
  134|  26.8M|  partial5a =
  135|  26.8M|      fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
  136|  26.8M|  partial6 = _mm256_madd_epi16(partial6, partial6);
  137|  26.8M|  partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));
  138|       |
  139|  26.8M|  partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
  140|  26.8M|  _mm_storeu_si128((__m128i *)cost_frist_8x8,
  141|  26.8M|                   _mm256_castsi256_si128(partial4a));
  142|  26.8M|  _mm_storeu_si128((__m128i *)cost_second_8x8,
  143|  26.8M|                   _mm256_extractf128_si256(partial4a, 1));
  144|       |
  145|  26.8M|  return partial4a;
  146|  26.8M|}
cdef_block_avx2.c:fold_mul_and_sum_avx2:
   25|  80.3M|                                            const __m256i *const2) {
   26|       |  // Mask used to shuffle the elements present in 256bit register.
   27|  80.3M|  static const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504,
   28|  80.3M|                                             0x0f0e0100, 0x0b0a0d0c, 0x07060908,
   29|  80.3M|                                             0x03020504, 0x0f0e0100 };
   30|  80.3M|  __m256i tmp;
   31|       |  /* Reverse partial B. */
   32|  80.3M|  *partialb = _mm256_shuffle_epi8(
   33|  80.3M|      *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));
   34|       |
   35|       |  /* Interleave the x and y values of identical indices and pair x8 with 0. */
   36|  80.3M|  tmp = *partiala;
   37|  80.3M|  *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
   38|  80.3M|  *partialb = _mm256_unpackhi_epi16(tmp, *partialb);
   39|       |
   40|       |  /* Square and add the corresponding x and y values. */
   41|  80.3M|  *partiala = _mm256_madd_epi16(*partiala, *partiala);
   42|  80.3M|  *partialb = _mm256_madd_epi16(*partialb, *partialb);
   43|       |  /* Multiply by constant. */
   44|  80.3M|  *partiala = _mm256_mullo_epi32(*partiala, *const1);
   45|  80.3M|  *partialb = _mm256_mullo_epi32(*partialb, *const2);
   46|       |  /* Sum all results. */
   47|  80.3M|  *partiala = _mm256_add_epi32(*partiala, *partialb);
   48|  80.3M|  return *partiala;
   49|  80.3M|}
cdef_block_avx2.c:hsum4_avx2:
   52|  26.8M|                                 __m256i *x3) {
   53|  26.8M|  const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1);
   54|  26.8M|  const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3);
   55|  26.8M|  const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1);
   56|  26.8M|  const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3);
   57|       |
   58|  26.8M|  *x0 = _mm256_unpacklo_epi64(t0, t1);
   59|  26.8M|  *x1 = _mm256_unpackhi_epi64(t0, t1);
   60|  26.8M|  *x2 = _mm256_unpacklo_epi64(t2, t3);
   61|  26.8M|  *x3 = _mm256_unpackhi_epi64(t2, t3);
   62|  26.8M|  return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1),
   63|  26.8M|                          _mm256_add_epi32(*x2, *x3));
   64|  26.8M|}
cdef_block_avx2.c:array_reverse_transpose_8x8_avx2:
  150|  13.4M|static inline void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
  151|  13.4M|  const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
  152|  13.4M|  const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]);
  153|  13.4M|  const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]);
  154|  13.4M|  const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
  155|  13.4M|  const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
  156|  13.4M|  const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]);
  157|  13.4M|  const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]);
  158|  13.4M|  const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
  159|       |
  160|  13.4M|  const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
  161|  13.4M|  const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
  162|  13.4M|  const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
  163|  13.4M|  const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
  164|  13.4M|  const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
  165|  13.4M|  const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
  166|  13.4M|  const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
  167|  13.4M|  const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
  168|       |
  169|  13.4M|  res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1);
  170|  13.4M|  res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1);
  171|  13.4M|  res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3);
  172|  13.4M|  res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3);
  173|  13.4M|  res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5);
  174|  13.4M|  res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5);
  175|  13.4M|  res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7);
  176|  13.4M|  res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7);
  177|  13.4M|}

cfl_get_luma_subsampling_420_lbd_avx2:
   24|   961k|      TX_SIZE tx_size) {                                                       \
   25|   961k|    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {         \
   26|   961k|      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
   27|   961k|      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
   28|   961k|      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
   29|   961k|      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
   30|   961k|      NULL,                                     /* 64x64 (invalid CFL size) */ \
   31|   961k|      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
   32|   961k|      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
   33|   961k|      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
   34|   961k|      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
   35|   961k|      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
   36|   961k|      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
   37|   961k|      NULL,                                     /* 32x64 (invalid CFL size) */ \
   38|   961k|      NULL,                                     /* 64x32 (invalid CFL size) */ \
   39|   961k|      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
   40|   961k|      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
   41|   961k|      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
   42|   961k|      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
   43|   961k|      NULL,                                     /* 16x64 (invalid CFL size) */ \
   44|   961k|      NULL,                                     /* 64x16 (invalid CFL size) */ \
   45|   961k|    };                                                                         \
   46|   961k|    return subfn_##sub[tx_size];                                               \
   47|   961k|  }
cfl_get_luma_subsampling_422_lbd_avx2:
   24|  2.22k|      TX_SIZE tx_size) {                                                       \
   25|  2.22k|    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {         \
   26|  2.22k|      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
   27|  2.22k|      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
   28|  2.22k|      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
   29|  2.22k|      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
   30|  2.22k|      NULL,                                     /* 64x64 (invalid CFL size) */ \
   31|  2.22k|      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
   32|  2.22k|      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
   33|  2.22k|      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
   34|  2.22k|      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
   35|  2.22k|      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
   36|  2.22k|      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
   37|  2.22k|      NULL,                                     /* 32x64 (invalid CFL size) */ \
   38|  2.22k|      NULL,                                     /* 64x32 (invalid CFL size) */ \
   39|  2.22k|      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
   40|  2.22k|      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
   41|  2.22k|      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
   42|  2.22k|      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
   43|  2.22k|      NULL,                                     /* 16x64 (invalid CFL size) */ \
   44|  2.22k|      NULL,                                     /* 64x16 (invalid CFL size) */ \
   45|  2.22k|    };                                                                         \
   46|  2.22k|    return subfn_##sub[tx_size];                                               \
   47|  2.22k|  }
cfl_get_luma_subsampling_444_lbd_avx2:
   24|   421k|      TX_SIZE tx_size) {                                                       \
   25|   421k|    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {         \
   26|   421k|      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
   27|   421k|      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
   28|   421k|      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
   29|   421k|      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
   30|   421k|      NULL,                                     /* 64x64 (invalid CFL size) */ \
   31|   421k|      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
   32|   421k|      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
   33|   421k|      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
   34|   421k|      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
   35|   421k|      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
   36|   421k|      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
   37|   421k|      NULL,                                     /* 32x64 (invalid CFL size) */ \
   38|   421k|      NULL,                                     /* 64x32 (invalid CFL size) */ \
   39|   421k|      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
   40|   421k|      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
   41|   421k|      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
   42|   421k|      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
   43|   421k|      NULL,                                     /* 16x64 (invalid CFL size) */ \
   44|   421k|      NULL,                                     /* 64x16 (invalid CFL size) */ \
   45|   421k|    };                                                                         \
   46|   421k|    return subfn_##sub[tx_size];                                               \
   47|   421k|  }
cfl_get_luma_subsampling_420_hbd_avx2:
   24|   878k|      TX_SIZE tx_size) {                                                       \
   25|   878k|    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {         \
   26|   878k|      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
   27|   878k|      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
   28|   878k|      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
   29|   878k|      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
   30|   878k|      NULL,                                     /* 64x64 (invalid CFL size) */ \
   31|   878k|      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
   32|   878k|      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
   33|   878k|      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
   34|   878k|      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
   35|   878k|      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
   36|   878k|      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
   37|   878k|      NULL,                                     /* 32x64 (invalid CFL size) */ \
   38|   878k|      NULL,                                     /* 64x32 (invalid CFL size) */ \
   39|   878k|      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
   40|   878k|      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
   41|   878k|      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
   42|   878k|      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
   43|   878k|      NULL,                                     /* 16x64 (invalid CFL size) */ \
   44|   878k|      NULL,                                     /* 64x16 (invalid CFL size) */ \
   45|   878k|    };                                                                         \
   46|   878k|    return subfn_##sub[tx_size];                                               \
   47|   878k|  }
cfl_get_luma_subsampling_422_hbd_avx2:
   24|  1.88k|      TX_SIZE tx_size) {                                                       \
   25|  1.88k|    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {         \
   26|  1.88k|      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
   27|  1.88k|      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
   28|  1.88k|      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
   29|  1.88k|      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
   30|  1.88k|      NULL,                                     /* 64x64 (invalid CFL size) */ \
   31|  1.88k|      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
   32|  1.88k|      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
   33|  1.88k|      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
   34|  1.88k|      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
   35|  1.88k|      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
   36|  1.88k|      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
   37|  1.88k|      NULL,                                     /* 32x64 (invalid CFL size) */ \
   38|  1.88k|      NULL,                                     /* 64x32 (invalid CFL size) */ \
   39|  1.88k|      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
   40|  1.88k|      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
   41|  1.88k|      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
   42|  1.88k|      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
   43|  1.88k|      NULL,                                     /* 16x64 (invalid CFL size) */ \
   44|  1.88k|      NULL,                                     /* 64x16 (invalid CFL size) */ \
   45|  1.88k|    };                                                                         \
   46|  1.88k|    return subfn_##sub[tx_size];                                               \
   47|  1.88k|  }
cfl_get_luma_subsampling_444_hbd_avx2:
   24|   767k|      TX_SIZE tx_size) {                                                       \
   25|   767k|    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {         \
   26|   767k|      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
   27|   767k|      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
   28|   767k|      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
   29|   767k|      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
   30|   767k|      NULL,                                     /* 64x64 (invalid CFL size) */ \
   31|   767k|      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
   32|   767k|      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
   33|   767k|      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
   34|   767k|      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
   35|   767k|      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
   36|   767k|      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
   37|   767k|      NULL,                                     /* 32x64 (invalid CFL size) */ \
   38|   767k|      NULL,                                     /* 64x32 (invalid CFL size) */ \
   39|   767k|      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
   40|   767k|      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
   41|   767k|      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
   42|   767k|      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
   43|   767k|      NULL,                                     /* 16x64 (invalid CFL size) */ \
   44|   767k|      NULL,                                     /* 64x16 (invalid CFL size) */ \
   45|   767k|    };                                                                         \
   46|   767k|    return subfn_##sub[tx_size];                                               \
   47|   767k|  }
cfl_get_predict_lbd_fn_avx2:
  278|  1.40M|cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
  279|  1.40M|  static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
  280|  1.40M|    cfl_predict_lbd_4x4_ssse3,   /* 4x4 */
  281|  1.40M|    cfl_predict_lbd_8x8_ssse3,   /* 8x8 */
  282|  1.40M|    cfl_predict_lbd_16x16_ssse3, /* 16x16 */
  283|  1.40M|    cfl_predict_lbd_32x32_avx2,  /* 32x32 */
  284|  1.40M|    NULL,                        /* 64x64 (invalid CFL size) */
  285|  1.40M|    cfl_predict_lbd_4x8_ssse3,   /* 4x8 */
  286|  1.40M|    cfl_predict_lbd_8x4_ssse3,   /* 8x4 */
  287|  1.40M|    cfl_predict_lbd_8x16_ssse3,  /* 8x16 */
  288|  1.40M|    cfl_predict_lbd_16x8_ssse3,  /* 16x8 */
  289|  1.40M|    cfl_predict_lbd_16x32_ssse3, /* 16x32 */
  290|  1.40M|    cfl_predict_lbd_32x16_avx2,  /* 32x16 */
  291|  1.40M|    NULL,                        /* 32x64 (invalid CFL size) */
  292|  1.40M|    NULL,                        /* 64x32 (invalid CFL size) */
  293|  1.40M|    cfl_predict_lbd_4x16_ssse3,  /* 4x16  */
  294|  1.40M|    cfl_predict_lbd_16x4_ssse3,  /* 16x4  */
  295|  1.40M|    cfl_predict_lbd_8x32_ssse3,  /* 8x32  */
  296|  1.40M|    cfl_predict_lbd_32x8_avx2,   /* 32x8  */
  297|  1.40M|    NULL,                        /* 16x64 (invalid CFL size) */
  298|  1.40M|    NULL,                        /* 64x16 (invalid CFL size) */
  299|  1.40M|  };
  300|       |  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
  301|       |  // function pointer array out of bounds.
  302|  1.40M|  return pred[tx_size % TX_SIZES_ALL];
  303|  1.40M|}
cfl_get_predict_hbd_fn_avx2:
  352|  1.57M|cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
  353|  1.57M|  static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
  354|  1.57M|    cfl_predict_hbd_4x4_ssse3,  /* 4x4 */
  355|  1.57M|    cfl_predict_hbd_8x8_ssse3,  /* 8x8 */
  356|  1.57M|    cfl_predict_hbd_16x16_avx2, /* 16x16 */
  357|  1.57M|    cfl_predict_hbd_32x32_avx2, /* 32x32 */
  358|  1.57M|    NULL,                       /* 64x64 (invalid CFL size) */
  359|  1.57M|    cfl_predict_hbd_4x8_ssse3,  /* 4x8 */
  360|  1.57M|    cfl_predict_hbd_8x4_ssse3,  /* 8x4 */
  361|  1.57M|    cfl_predict_hbd_8x16_ssse3, /* 8x16 */
  362|  1.57M|    cfl_predict_hbd_16x8_avx2,  /* 16x8 */
  363|  1.57M|    cfl_predict_hbd_16x32_avx2, /* 16x32 */
  364|  1.57M|    cfl_predict_hbd_32x16_avx2, /* 32x16 */
  365|  1.57M|    NULL,                       /* 32x64 (invalid CFL size) */
  366|  1.57M|    NULL,                       /* 64x32 (invalid CFL size) */
  367|  1.57M|    cfl_predict_hbd_4x16_ssse3, /* 4x16  */
  368|  1.57M|    cfl_predict_hbd_16x4_avx2,  /* 16x4  */
  369|  1.57M|    cfl_predict_hbd_8x32_ssse3, /* 8x32  */
  370|  1.57M|    cfl_predict_hbd_32x8_avx2,  /* 32x8  */
  371|  1.57M|    NULL,                       /* 16x64 (invalid CFL size) */
  372|  1.57M|    NULL,                       /* 64x16 (invalid CFL size) */
  373|  1.57M|  };
  374|       |  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
  375|       |  // function pointer array out of bounds.
  376|  1.57M|  return pred[tx_size % TX_SIZES_ALL];
  377|  1.57M|}
cfl_get_subtract_average_fn_avx2:
  470|  1.48M|cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) {
  471|  1.48M|  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
  472|  1.48M|    cfl_subtract_average_4x4_sse2,   /* 4x4 */
  473|  1.48M|    cfl_subtract_average_8x8_sse2,   /* 8x8 */
  474|  1.48M|    cfl_subtract_average_16x16_avx2, /* 16x16 */
  475|  1.48M|    cfl_subtract_average_32x32_avx2, /* 32x32 */
  476|  1.48M|    NULL,                            /* 64x64 (invalid CFL size) */
  477|  1.48M|    cfl_subtract_average_4x8_sse2,   /* 4x8 */
  478|  1.48M|    cfl_subtract_average_8x4_sse2,   /* 8x4 */
  479|  1.48M|    cfl_subtract_average_8x16_sse2,  /* 8x16 */
  480|  1.48M|    cfl_subtract_average_16x8_avx2,  /* 16x8 */
  481|  1.48M|    cfl_subtract_average_16x32_avx2, /* 16x32 */
  482|  1.48M|    cfl_subtract_average_32x16_avx2, /* 32x16 */
  483|  1.48M|    NULL,                            /* 32x64 (invalid CFL size) */
  484|  1.48M|    NULL,                            /* 64x32 (invalid CFL size) */
  485|  1.48M|    cfl_subtract_average_4x16_sse2,  /* 4x16 */
  486|  1.48M|    cfl_subtract_average_16x4_avx2,  /* 16x4 */
  487|  1.48M|    cfl_subtract_average_8x32_sse2,  /* 8x32 */
  488|  1.48M|    cfl_subtract_average_32x8_avx2,  /* 32x8 */
  489|  1.48M|    NULL,                            /* 16x64 (invalid CFL size) */
  490|  1.48M|    NULL,                            /* 64x16 (invalid CFL size) */
  491|  1.48M|  };
  492|       |  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
  493|       |  // index the function pointer array out of bounds.
  494|  1.48M|  return sub_avg[tx_size % TX_SIZES_ALL];
  495|  1.48M|}
cfl_avx2.c:cfl_luma_subsampling_420_lbd_avx2:
   64|  46.6k|                                              int height) {
   65|  46.6k|  (void)width;                               // Forever 32
   66|  46.6k|  const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos
   67|  46.6k|  const int luma_stride = input_stride << 1;
   68|  46.6k|  __m256i *row = (__m256i *)pred_buf_q3;
   69|  46.6k|  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  46.6k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  46.6k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   70|   503k|  do {
   71|   503k|    __m256i top = _mm256_loadu_si256((__m256i *)input);
   72|   503k|    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
   73|       |
   74|   503k|    __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
   75|   503k|    __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
   76|   503k|    __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
   77|       |
   78|   503k|    _mm256_storeu_si256(row, sum_16x16);
   79|       |
   80|   503k|    input += luma_stride;
   81|   503k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|   503k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   503k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (81:12): [True: 456k, False: 46.6k]
  ------------------
   82|  46.6k|}
cfl_avx2.c:cfl_luma_subsampling_422_lbd_avx2:
   99|    425|                                              int height) {
  100|    425|  (void)width;                                // Forever 32
  101|    425|  const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
  102|    425|  __m256i *row = (__m256i *)pred_buf_q3;
  103|    425|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|    425|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|    425|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  104|  7.00k|  do {
  105|  7.00k|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  106|  7.00k|    __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
  107|  7.00k|    _mm256_storeu_si256(row, top_16x16);
  108|  7.00k|    input += input_stride;
  109|  7.00k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|  7.00k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  7.00k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (109:12): [True: 6.58k, False: 425]
  ------------------
  110|    425|}
cfl_avx2.c:cfl_luma_subsampling_444_lbd_avx2:
  127|  43.2k|                                              int height) {
  128|  43.2k|  (void)width;  // Forever 32
  129|  43.2k|  __m256i *row = (__m256i *)pred_buf_q3;
  130|  43.2k|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  43.2k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  43.2k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  131|  43.2k|  const __m256i zeros = _mm256_setzero_si256();
  132|   848k|  do {
  133|   848k|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  134|   848k|    top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
  135|       |
  136|   848k|    __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
  137|   848k|    row_lo = _mm256_slli_epi16(row_lo, 3);
  138|   848k|    __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
  139|   848k|    row_hi = _mm256_slli_epi16(row_hi, 3);
  140|       |
  141|   848k|    _mm256_storeu_si256(row, row_lo);
  142|   848k|    _mm256_storeu_si256(row + 1, row_hi);
  143|       |
  144|   848k|    input += input_stride;
  145|   848k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|   848k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   848k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (145:12): [True: 805k, False: 43.2k]
  ------------------
  146|  43.2k|}
cfl_avx2.c:cfl_luma_subsampling_420_hbd_avx2:
  166|  27.4k|                                              int height) {
  167|  27.4k|  (void)width;  // Forever 32
  168|  27.4k|  const int luma_stride = input_stride << 1;
  169|  27.4k|  __m256i *row = (__m256i *)pred_buf_q3;
  170|  27.4k|  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  27.4k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  27.4k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  171|   291k|  do {
  172|   291k|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  173|   291k|    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
  174|   291k|    __m256i sum = _mm256_add_epi16(top, bot);
  175|       |
  176|   291k|    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
  177|   291k|    __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
  178|   291k|    __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);
  179|       |
  180|   291k|    __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
  181|   291k|    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
  182|   291k|    hsum = _mm256_add_epi16(hsum, hsum);
  183|       |
  184|   291k|    _mm256_storeu_si256(row, hsum);
  185|       |
  186|   291k|    input += luma_stride;
  187|   291k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|   291k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   291k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (187:12): [True: 263k, False: 27.4k]
  ------------------
  188|  27.4k|}
cfl_avx2.c:cfl_luma_subsampling_422_hbd_avx2:
  206|    119|                                              int height) {
  207|    119|  (void)width;  // Forever 32
  208|    119|  __m256i *row = (__m256i *)pred_buf_q3;
  209|    119|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|    119|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|    119|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  210|  1.78k|  do {
  211|  1.78k|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  212|  1.78k|    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
  213|  1.78k|    __m256i hsum = _mm256_hadd_epi16(top, top_1);
  214|  1.78k|    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
  215|  1.78k|    hsum = _mm256_slli_epi16(hsum, 2);
  216|       |
  217|  1.78k|    _mm256_storeu_si256(row, hsum);
  218|       |
  219|  1.78k|    input += input_stride;
  220|  1.78k|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|  1.78k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  1.78k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (220:12): [True: 1.66k, False: 119]
  ------------------
  221|    119|}
cfl_avx2.c:cfl_luma_subsampling_444_hbd_avx2:
  228|  61.2k|                                              int height) {
  229|  61.2k|  (void)width;  // Forever 32
  230|  61.2k|  __m256i *row = (__m256i *)pred_buf_q3;
  231|  61.2k|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  61.2k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  61.2k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  232|  1.03M|  do {
  233|  1.03M|    __m256i top = _mm256_loadu_si256((__m256i *)input);
  234|  1.03M|    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
  235|  1.03M|    _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3));
  236|  1.03M|    _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3));
  237|  1.03M|    input += input_stride;
  238|  1.03M|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|  1.03M|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  1.03M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (238:12): [True: 973k, False: 61.2k]
  ------------------
  239|  61.2k|}
cfl_avx2.c:cfl_predict_lbd_avx2:
  256|  87.2k|                                        int alpha_q3, int width, int height) {
  257|  87.2k|  (void)width;
  258|  87.2k|  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
  259|  87.2k|  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
  260|  87.2k|  const __m256i dc_q0 = _mm256_set1_epi16(*dst);
  261|  87.2k|  __m256i *row = (__m256i *)pred_buf_q3;
  262|  87.2k|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  87.2k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  87.2k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  263|       |
  264|  1.71M|  do {
  265|  1.71M|    __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
  266|  1.71M|    __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
  267|  1.71M|    res = _mm256_packus_epi16(res, next);
  268|  1.71M|    res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
  269|  1.71M|    _mm256_storeu_si256((__m256i *)dst, res);
  270|  1.71M|    dst += dst_stride;
  271|  1.71M|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|  1.71M|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  1.71M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (271:12): [True: 1.62M, False: 87.2k]
  ------------------
  272|  87.2k|}
cfl_avx2.c:predict_unclipped:
  245|  12.0M|                                        __m256i alpha_sign, __m256i dc_q0) {
  246|  12.0M|  __m256i ac_q3 = _mm256_loadu_si256(input);
  247|  12.0M|  __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
  248|  12.0M|  __m256i scaled_luma_q0 =
  249|  12.0M|      _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
  250|  12.0M|  scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
  251|  12.0M|  return _mm256_add_epi16(scaled_luma_q0, dc_q0);
  252|  12.0M|}
cfl_avx2.c:cfl_predict_hbd_avx2:
  319|   538k|                                        int height) {
  320|       |  // Use SSSE3 version for smaller widths
  321|   538k|  assert(width == 16 || width == 32);
  322|   538k|  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
  323|   538k|  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
  324|   538k|  const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
  325|   538k|  const __m256i max = highbd_max_epi16(bd);
  326|       |
  327|   538k|  __m256i *row = (__m256i *)pred_buf_q3;
  328|   538k|  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|   538k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   538k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  329|  6.50M|  do {
  330|  6.50M|    const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
  331|  6.50M|    _mm256_storeu_si256((__m256i *)dst,
  332|  6.50M|                        highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
  333|  6.50M|    if (width == 32) {
  ------------------
  |  Branch (333:9): [True: 2.15M, False: 4.34M]
  ------------------
  334|  2.15M|      const __m256i res_1 =
  335|  2.15M|          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
  336|  2.15M|      _mm256_storeu_si256(
  337|  2.15M|          (__m256i *)(dst + 16),
  338|  2.15M|          highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
  339|  2.15M|    }
  340|  6.50M|    dst += dst_stride;
  341|  6.50M|  } while ((row += CFL_BUF_LINE_I256) < row_end);
  ------------------
  |  |  524|  6.50M|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  6.50M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (341:12): [True: 5.96M, False: 538k]
  ------------------
  342|   538k|}
cfl_avx2.c:highbd_max_epi16:
  306|   538k|static __m256i highbd_max_epi16(int bd) {
  307|   538k|  const __m256i neg_one = _mm256_set1_epi16(-1);
  308|       |  // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
  309|   538k|  return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
  310|   538k|}
cfl_avx2.c:highbd_clamp_epi16:
  312|  8.65M|static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
  313|  8.65M|  return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
  314|  8.65M|}
cfl_avx2.c:subtract_average_avx2:
  405|   480k|                                         int num_pel_log2) {
  406|       |  // Use SSE2 version for smaller widths
  407|   480k|  assert(width == 16 || width == 32);
  408|       |
  409|   480k|  const __m256i *src = (__m256i *)src_ptr;
  410|   480k|  const __m256i *const end = src + height * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|   480k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   480k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  411|       |  // To maximize usage of the AVX2 registers, we sum two rows per loop
  412|       |  // iteration
  413|   480k|  const int step = 2 * CFL_BUF_LINE_I256;
  ------------------
  |  |  524|   480k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   480k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  414|       |
  415|   480k|  __m256i sum = _mm256_setzero_si256();
  416|       |  // For width 32, we use a second sum accumulator to reduce accumulator
  417|       |  // dependencies in the loop.
  418|   480k|  __m256i sum2;
  419|   480k|  if (width == 32) sum2 = _mm256_setzero_si256();
  ------------------
  |  Branch (419:7): [True: 107k, False: 373k]
  ------------------
  420|       |
  421|  2.99M|  do {
  422|       |    // Add top row to the bottom row
  423|  2.99M|    __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
  424|  2.99M|                                  _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
  ------------------
  |  |  524|  2.99M|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  2.99M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  425|  2.99M|    sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
  426|  2.99M|    if (width == 32) { /* Don't worry, this if it gets optimized out. */
  ------------------
  |  Branch (426:9): [True: 968k, False: 2.02M]
  ------------------
  427|       |      // Add the second part of the top row to the second part of the bottom row
  428|   968k|      __m256i l1 =
  429|   968k|          _mm256_add_epi16(_mm256_loadu_si256(src + 1),
  430|   968k|                           _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
  ------------------
  |  |  524|   968k|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|   968k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  431|   968k|      sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
  432|   968k|    }
  433|  2.99M|    src += step;
  434|  2.99M|  } while (src < end);
  ------------------
  |  Branch (434:12): [True: 2.51M, False: 480k]
  ------------------
  435|       |  // Combine both sum accumulators
  436|   480k|  if (width == 32) sum = _mm256_add_epi32(sum, sum2);
  ------------------
  |  Branch (436:7): [True: 107k, False: 373k]
  ------------------
  437|       |
  438|   480k|  __m256i fill = fill_sum_epi32(sum);
  439|       |
  440|   480k|  __m256i avg_epi16 = _mm256_srli_epi32(
  441|   480k|      _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
  442|   480k|  avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
  443|       |
  444|       |  // Store and subtract loop
  445|   480k|  src = (__m256i *)src_ptr;
  446|   480k|  __m256i *dst = (__m256i *)dst_ptr;
  447|  5.99M|  do {
  448|  5.99M|    _mm256_storeu_si256(dst,
  449|  5.99M|                        _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
  450|  5.99M|    if (width == 32) {
  ------------------
  |  Branch (450:9): [True: 1.93M, False: 4.05M]
  ------------------
  451|  1.93M|      _mm256_storeu_si256(
  452|  1.93M|          dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
  453|  1.93M|    }
  454|  5.99M|    src += CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  5.99M|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  5.99M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  455|  5.99M|    dst += CFL_BUF_LINE_I256;
  ------------------
  |  |  524|  5.99M|#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
  |  |  ------------------
  |  |  |  |  522|  5.99M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  456|  5.99M|  } while (src < end);
  ------------------
  |  Branch (456:12): [True: 5.50M, False: 480k]
  ------------------
  457|   480k|}
cfl_avx2.c:_mm256_addl_epi16:
  397|  3.96M|static inline __m256i _mm256_addl_epi16(__m256i a) {
  398|  3.96M|  return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
  399|  3.96M|                          _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
  400|  3.96M|}
cfl_avx2.c:fill_sum_epi32:
  382|   480k|static inline __m256i fill_sum_epi32(__m256i a) {
  383|       |  // Given that a == [A, B, C, D, E, F, G, H]
  384|   480k|  a = _mm256_hadd_epi32(a, a);
  385|       |  // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
  386|       |  // a == [A', C', A', C', E', G', E', G']
  387|   480k|  a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
  388|       |  // a == [A', C', E', G', A', C', E', G']
  389|   480k|  a = _mm256_hadd_epi32(a, a);
  390|       |  // Given that A'' == A' + C' and E'' == E' + G'
  391|       |  // a == [A'', E'', A'', E'', A'', E'', A'', E'']
  392|   480k|  return _mm256_hadd_epi32(a, a);
  393|       |  // Given that A''' == A'' + E''
  394|       |  // a == [A''', A''', A''', A''', A''', A''', A''', A''']
  395|   480k|}

cfl_sse2.c:subtract_average_sse2:
   25|  1.00M|                                         int num_pel_log2) {
   26|  1.00M|  const __m128i zeros = _mm_setzero_si128();
   27|  1.00M|  const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
   28|  1.00M|  const __m128i *src = (__m128i *)src_ptr;
   29|  1.00M|  const __m128i *const end = src + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  1.00M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  1.00M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   30|  1.00M|  const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
  ------------------
  |  |  523|  1.00M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  1.00M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   31|       |
   32|  1.00M|  __m128i sum = zeros;
   33|  3.54M|  do {
   34|  3.54M|    __m128i l0;
   35|  3.54M|    if (width == 4) {
  ------------------
  |  Branch (35:9): [True: 852k, False: 2.69M]
  ------------------
   36|   852k|      l0 = _mm_add_epi16(_mm_loadl_epi64(src),
   37|   852k|                         _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
  ------------------
  |  |  523|   852k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   852k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   38|   852k|      __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
  ------------------
  |  |  523|   852k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   852k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   39|   852k|                                 _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
  ------------------
  |  |  523|   852k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   852k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   40|   852k|      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
   41|   852k|                                             _mm_unpacklo_epi16(l1, zeros)));
   42|  2.69M|    } else {
   43|  2.69M|      if (width == 8) {
  ------------------
  |  Branch (43:11): [True: 2.69M, False: 0]
  ------------------
   44|  2.69M|        l0 = _mm_add_epi16(_mm_loadu_si128(src),
   45|  2.69M|                           _mm_loadu_si128(src + CFL_BUF_LINE_I128));
  ------------------
  |  |  523|  2.69M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  2.69M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   46|  2.69M|      } else {
   47|      0|        l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
   48|      0|      }
   49|  2.69M|      sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
   50|  2.69M|                                             _mm_unpackhi_epi16(l0, zeros)));
   51|  2.69M|      if (width == 32) {
  ------------------
  |  Branch (51:11): [True: 0, False: 2.69M]
  ------------------
   52|      0|        l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
   53|      0|        sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
   54|      0|                                               _mm_unpackhi_epi16(l0, zeros)));
   55|      0|      }
   56|  2.69M|    }
   57|  3.54M|    src += step;
   58|  3.54M|  } while (src < end);
  ------------------
  |  Branch (58:12): [True: 2.53M, False: 1.00M]
  ------------------
   59|       |
   60|  1.00M|  sum = fill_sum_epi32(sum);
   61|       |
   62|  1.00M|  __m128i avg_epi16 =
   63|  1.00M|      _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
   64|  1.00M|  avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
   65|       |
   66|  1.00M|  src = (__m128i *)src_ptr;
   67|  1.00M|  __m128i *dst = (__m128i *)dst_ptr;
   68|  8.79M|  do {
   69|  8.79M|    if (width == 4) {
  ------------------
  |  Branch (69:9): [True: 3.41M, False: 5.38M]
  ------------------
   70|  3.41M|      _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
   71|  5.38M|    } else {
   72|  5.38M|      _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
   73|  5.38M|      if (width > 8) {
  ------------------
  |  Branch (73:11): [True: 0, False: 5.38M]
  ------------------
   74|      0|        _mm_storeu_si128(dst + 1,
   75|      0|                         _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
   76|      0|        if (width == 32) {
  ------------------
  |  Branch (76:13): [True: 0, False: 0]
  ------------------
   77|      0|          _mm_storeu_si128(dst + 2,
   78|      0|                           _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
   79|      0|          _mm_storeu_si128(dst + 3,
   80|      0|                           _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
   81|      0|        }
   82|      0|      }
   83|  5.38M|    }
   84|  8.79M|    src += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  8.79M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  8.79M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   85|  8.79M|    dst += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  8.79M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  8.79M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   86|  8.79M|  } while (src < end);
  ------------------
  |  Branch (86:12): [True: 7.78M, False: 1.00M]
  ------------------
   87|  1.00M|}
cfl_sse2.c:fill_sum_epi32:
   17|  1.00M|static inline __m128i fill_sum_epi32(__m128i l0) {
   18|  1.00M|  l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
   19|  1.00M|  return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
   20|  1.00M|}

cfl_ssse3.c:cfl_luma_subsampling_420_lbd_ssse3:
   43|   914k|                                                      int width, int height) {
   44|   914k|  const __m128i twos = _mm_set1_epi8(2);
   45|   914k|  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
   46|   914k|  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|   914k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   914k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   47|   914k|  const int luma_stride = input_stride << 1;
   48|  3.73M|  do {
   49|  3.73M|    if (width == 4) {
  ------------------
  |  Branch (49:9): [True: 1.78M, False: 1.95M]
  ------------------
   50|  1.78M|      __m128i top = _mm_loadh_epi32((__m128i *)input);
   51|  1.78M|      top = _mm_maddubs_epi16(top, twos);
   52|  1.78M|      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
   53|  1.78M|      bot = _mm_maddubs_epi16(bot, twos);
   54|  1.78M|      const __m128i sum = _mm_add_epi16(top, bot);
   55|  1.78M|      _mm_storeh_epi32(pred_buf_m128i, sum);
   56|  1.95M|    } else if (width == 8) {
  ------------------
  |  Branch (56:16): [True: 957k, False: 994k]
  ------------------
   57|   957k|      __m128i top = _mm_loadl_epi64((__m128i *)input);
   58|   957k|      top = _mm_maddubs_epi16(top, twos);
   59|   957k|      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
   60|   957k|      bot = _mm_maddubs_epi16(bot, twos);
   61|   957k|      const __m128i sum = _mm_add_epi16(top, bot);
   62|   957k|      _mm_storel_epi64(pred_buf_m128i, sum);
   63|   994k|    } else {
   64|   994k|      __m128i top = _mm_loadu_si128((__m128i *)input);
   65|   994k|      top = _mm_maddubs_epi16(top, twos);
   66|   994k|      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
   67|   994k|      bot = _mm_maddubs_epi16(bot, twos);
   68|   994k|      const __m128i sum = _mm_add_epi16(top, bot);
   69|   994k|      _mm_storeu_si128(pred_buf_m128i, sum);
   70|   994k|      if (width == 32) {
  ------------------
  |  Branch (70:11): [True: 0, False: 994k]
  ------------------
   71|      0|        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
   72|      0|        __m128i bot_1 =
   73|      0|            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
   74|      0|        top_1 = _mm_maddubs_epi16(top_1, twos);
   75|      0|        bot_1 = _mm_maddubs_epi16(bot_1, twos);
   76|      0|        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
   77|      0|        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
   78|      0|      }
   79|   994k|    }
   80|  3.73M|    input += luma_stride;
   81|  3.73M|    pred_buf_m128i += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  3.73M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  3.73M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
   82|  3.73M|  } while (pred_buf_m128i < end);
  ------------------
  |  Branch (82:12): [True: 2.82M, False: 914k]
  ------------------
   83|   914k|}
cfl_ssse3.c:_mm_loadh_epi32:
   21|  4.46M|static inline __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
   22|  4.46M|  return _mm_cvtsi32_si128(*((int *)mem_addr));
   23|  4.46M|}
cfl_ssse3.c:_mm_storeh_epi32:
   26|  5.24M|static inline void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
   27|  5.24M|  *((int *)mem_addr) = _mm_cvtsi128_si32(a);
   28|  5.24M|}
cfl_ssse3.c:cfl_luma_subsampling_422_lbd_ssse3:
   98|  1.79k|                                                      int width, int height) {
   99|  1.79k|  const __m128i fours = _mm_set1_epi8(4);
  100|  1.79k|  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  101|  1.79k|  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  1.79k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  1.79k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  102|  12.6k|  do {
  103|  12.6k|    if (width == 4) {
  ------------------
  |  Branch (103:9): [True: 2.67k, False: 9.94k]
  ------------------
  104|  2.67k|      __m128i top = _mm_loadh_epi32((__m128i *)input);
  105|  2.67k|      top = _mm_maddubs_epi16(top, fours);
  106|  2.67k|      _mm_storeh_epi32(pred_buf_m128i, top);
  107|  9.94k|    } else if (width == 8) {
  ------------------
  |  Branch (107:16): [True: 3.56k, False: 6.38k]
  ------------------
  108|  3.56k|      __m128i top = _mm_loadl_epi64((__m128i *)input);
  109|  3.56k|      top = _mm_maddubs_epi16(top, fours);
  110|  3.56k|      _mm_storel_epi64(pred_buf_m128i, top);
  111|  6.38k|    } else {
  112|  6.38k|      __m128i top = _mm_loadu_si128((__m128i *)input);
  113|  6.38k|      top = _mm_maddubs_epi16(top, fours);
  114|  6.38k|      _mm_storeu_si128(pred_buf_m128i, top);
  115|  6.38k|      if (width == 32) {
  ------------------
  |  Branch (115:11): [True: 0, False: 6.38k]
  ------------------
  116|      0|        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  117|      0|        top_1 = _mm_maddubs_epi16(top_1, fours);
  118|      0|        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
  119|      0|      }
  120|  6.38k|    }
  121|  12.6k|    input += input_stride;
  122|  12.6k|    pred_buf_m128i += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  12.6k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  12.6k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  123|  12.6k|  } while (pred_buf_m128i < end);
  ------------------
  |  Branch (123:12): [True: 10.8k, False: 1.79k]
  ------------------
  124|  1.79k|}
cfl_ssse3.c:cfl_luma_subsampling_444_lbd_ssse3:
  138|   378k|                                                      int width, int height) {
  139|   378k|  const __m128i zeros = _mm_setzero_si128();
  140|   378k|  const int luma_stride = input_stride;
  141|   378k|  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  142|   378k|  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|   378k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|   378k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  143|  3.89M|  do {
  144|  3.89M|    if (width == 4) {
  ------------------
  |  Branch (144:9): [True: 884k, False: 3.00M]
  ------------------
  145|   884k|      __m128i row = _mm_loadh_epi32((__m128i *)input);
  146|   884k|      row = _mm_unpacklo_epi8(row, zeros);
  147|   884k|      _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));
  148|  3.00M|    } else if (width == 8) {
  ------------------
  |  Branch (148:16): [True: 1.66M, False: 1.34M]
  ------------------
  149|  1.66M|      __m128i row = _mm_loadl_epi64((__m128i *)input);
  150|  1.66M|      row = _mm_unpacklo_epi8(row, zeros);
  151|  1.66M|      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));
  152|  1.66M|    } else {
  153|  1.34M|      __m128i row = _mm_loadu_si128((__m128i *)input);
  154|  1.34M|      const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);
  155|  1.34M|      const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);
  156|  1.34M|      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
  157|  1.34M|      _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
  158|  1.34M|      if (width == 32) {
  ------------------
  |  Branch (158:11): [True: 0, False: 1.34M]
  ------------------
  159|      0|        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  160|      0|        const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
  161|      0|        const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
  162|      0|        _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
  163|      0|        _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
  164|      0|      }
  165|  1.34M|    }
  166|  3.89M|    input += luma_stride;
  167|  3.89M|    pred_buf_m128i += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  3.89M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  3.89M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  168|  3.89M|  } while (pred_buf_m128i < end);
  ------------------
  |  Branch (168:12): [True: 3.51M, False: 378k]
  ------------------
  169|   378k|}
cfl_ssse3.c:cfl_luma_subsampling_420_hbd_ssse3:
  185|   850k|                                                      int width, int height) {
  186|   850k|  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  ------------------
  |  |  522|   850k|#define CFL_BUF_LINE (32)
  ------------------
  187|   850k|  const int luma_stride = input_stride << 1;
  188|  3.28M|  do {
  189|  3.28M|    if (width == 4) {
  ------------------
  |  Branch (189:9): [True: 1.73M, False: 1.54M]
  ------------------
  190|  1.73M|      const __m128i top = _mm_loadl_epi64((__m128i *)input);
  191|  1.73M|      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
  192|  1.73M|      __m128i sum = _mm_add_epi16(top, bot);
  193|  1.73M|      sum = _mm_hadd_epi16(sum, sum);
  194|  1.73M|      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
  195|  1.73M|    } else {
  196|  1.54M|      const __m128i top = _mm_loadu_si128((__m128i *)input);
  197|  1.54M|      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
  198|  1.54M|      __m128i sum = _mm_add_epi16(top, bot);
  199|  1.54M|      if (width == 8) {
  ------------------
  |  Branch (199:11): [True: 813k, False: 733k]
  ------------------
  200|   813k|        sum = _mm_hadd_epi16(sum, sum);
  201|   813k|        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
  202|   813k|      } else {
  203|   733k|        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  204|   733k|        const __m128i bot_1 =
  205|   733k|            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
  206|   733k|        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
  207|   733k|        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
  208|   733k|        if (width == 32) {
  ------------------
  |  Branch (208:13): [True: 0, False: 733k]
  ------------------
  209|      0|          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
  210|      0|          const __m128i bot_2 =
  211|      0|              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
  212|      0|          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
  213|      0|          const __m128i bot_3 =
  214|      0|              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
  215|      0|          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
  216|      0|          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
  217|      0|          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
  218|      0|          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
  219|      0|                           _mm_add_epi16(next_sum, next_sum));
  220|      0|        }
  221|   733k|      }
  222|  1.54M|    }
  223|  3.28M|    input += luma_stride;
  224|  3.28M|  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  ------------------
  |  |  522|  3.28M|#define CFL_BUF_LINE (32)
  ------------------
  |  Branch (224:12): [True: 2.43M, False: 850k]
  ------------------
  225|   850k|}
cfl_ssse3.c:cfl_luma_subsampling_422_hbd_ssse3:
  240|  1.76k|                                                      int width, int height) {
  241|  1.76k|  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  242|  1.76k|  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  1.76k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  1.76k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  243|  8.83k|  do {
  244|  8.83k|    if (width == 4) {
  ------------------
  |  Branch (244:9): [True: 4.89k, False: 3.94k]
  ------------------
  245|  4.89k|      const __m128i top = _mm_loadl_epi64((__m128i *)input);
  246|  4.89k|      const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
  247|  4.89k|      _mm_storeh_epi32(pred_buf_m128i, sum);
  248|  4.89k|    } else {
  249|  3.94k|      const __m128i top = _mm_loadu_si128((__m128i *)input);
  250|  3.94k|      if (width == 8) {
  ------------------
  |  Branch (250:11): [True: 2.30k, False: 1.64k]
  ------------------
  251|  2.30k|        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
  252|  2.30k|        _mm_storel_epi64(pred_buf_m128i, sum);
  253|  2.30k|      } else {
  254|  1.64k|        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  255|  1.64k|        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
  256|  1.64k|        _mm_storeu_si128(pred_buf_m128i, sum);
  257|  1.64k|        if (width == 32) {
  ------------------
  |  Branch (257:13): [True: 0, False: 1.64k]
  ------------------
  258|      0|          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
  259|      0|          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
  260|      0|          const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
  261|      0|          _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
  262|      0|        }
  263|  1.64k|      }
  264|  3.94k|    }
  265|  8.83k|    pred_buf_m128i += CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  8.83k|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  8.83k|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  266|  8.83k|    input += input_stride;
  267|  8.83k|  } while (pred_buf_m128i < end);
  ------------------
  |  Branch (267:12): [True: 7.07k, False: 1.76k]
  ------------------
  268|  1.76k|}
cfl_ssse3.c:cfl_luma_subsampling_444_hbd_ssse3:
  273|   705k|                                                      int width, int height) {
  274|   705k|  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
  ------------------
  |  |  522|   705k|#define CFL_BUF_LINE (32)
  ------------------
  275|  6.14M|  do {
  276|  6.14M|    if (width == 4) {
  ------------------
  |  Branch (276:9): [True: 1.40M, False: 4.73M]
  ------------------
  277|  1.40M|      const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3);
  278|  1.40M|      _mm_storel_epi64((__m128i *)pred_buf_q3, row);
  279|  4.73M|    } else {
  280|  4.73M|      const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3);
  281|  4.73M|      _mm_storeu_si128((__m128i *)pred_buf_q3, row);
  282|  4.73M|      if (width >= 16) {
  ------------------
  |  Branch (282:11): [True: 1.51M, False: 3.21M]
  ------------------
  283|  1.51M|        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
  284|  1.51M|        row_1 = _mm_slli_epi16(row_1, 3);
  285|  1.51M|        _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1);
  286|  1.51M|        if (width == 32) {
  ------------------
  |  Branch (286:13): [True: 0, False: 1.51M]
  ------------------
  287|      0|          __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2);
  288|      0|          row_2 = _mm_slli_epi16(row_2, 3);
  289|      0|          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2);
  290|      0|          __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3);
  291|      0|          row_3 = _mm_slli_epi16(row_3, 3);
  292|      0|          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3);
  293|      0|        }
  294|  1.51M|      }
  295|  4.73M|    }
  296|  6.14M|    input += input_stride;
  297|  6.14M|    pred_buf_q3 += CFL_BUF_LINE;
  ------------------
  |  |  522|  6.14M|#define CFL_BUF_LINE (32)
  ------------------
  298|  6.14M|  } while (pred_buf_q3 < end);
  ------------------
  |  Branch (298:12): [True: 5.43M, False: 705k]
  ------------------
  299|   705k|}
cfl_ssse3.c:cfl_predict_lbd_ssse3:
  315|  1.31M|                                         int alpha_q3, int width, int height) {
  316|  1.31M|  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
  317|  1.31M|  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  318|  1.31M|  const __m128i dc_q0 = _mm_set1_epi16(*dst);
  319|  1.31M|  __m128i *row = (__m128i *)pred_buf_q3;
  320|  1.31M|  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  1.31M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  1.31M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  321|  11.9M|  do {
  322|  11.9M|    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
  323|  11.9M|    if (width < 16) {
  ------------------
  |  Branch (323:9): [True: 8.22M, False: 3.76M]
  ------------------
  324|  8.22M|      res = _mm_packus_epi16(res, res);
  325|  8.22M|      if (width == 4)
  ------------------
  |  Branch (325:11): [True: 3.44M, False: 4.77M]
  ------------------
  326|  3.44M|        _mm_storeh_epi32((__m128i *)dst, res);
  327|  4.77M|      else
  328|  4.77M|        _mm_storel_epi64((__m128i *)dst, res);
  329|  8.22M|    } else {
  330|  3.76M|      __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
  331|  3.76M|      res = _mm_packus_epi16(res, next);
  332|  3.76M|      _mm_storeu_si128((__m128i *)dst, res);
  333|  3.76M|      if (width == 32) {
  ------------------
  |  Branch (333:11): [True: 0, False: 3.76M]
  ------------------
  334|      0|        res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
  335|      0|        next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
  336|      0|        res = _mm_packus_epi16(res, next);
  337|      0|        _mm_storeu_si128((__m128i *)(dst + 16), res);
  338|      0|      }
  339|  3.76M|    }
  340|  11.9M|    dst += dst_stride;
  341|  11.9M|  } while ((row += CFL_BUF_LINE_I128) < row_end);
  ------------------
  |  |  523|  11.9M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  11.9M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (341:12): [True: 10.6M, False: 1.31M]
  ------------------
  342|  1.31M|}
cfl_ssse3.c:predict_unclipped:
  305|  25.1M|                                        __m128i alpha_sign, __m128i dc_q0) {
  306|  25.1M|  __m128i ac_q3 = _mm_loadu_si128(input);
  307|  25.1M|  __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
  308|  25.1M|  __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
  309|  25.1M|  scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
  310|  25.1M|  return _mm_add_epi16(scaled_luma_q0, dc_q0);
  311|  25.1M|}
cfl_ssse3.c:cfl_predict_hbd_ssse3:
  360|  1.03M|                                         int height) {
  361|  1.03M|  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
  362|  1.03M|  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  363|  1.03M|  const __m128i dc_q0 = _mm_set1_epi16(*dst);
  364|  1.03M|  const __m128i max = highbd_max_epi16(bd);
  365|  1.03M|  const __m128i zeros = _mm_setzero_si128();
  366|  1.03M|  __m128i *row = (__m128i *)pred_buf_q3;
  367|  1.03M|  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
  ------------------
  |  |  523|  1.03M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  1.03M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  368|  9.36M|  do {
  369|  9.36M|    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
  370|  9.36M|    res = highbd_clamp_epi16(res, zeros, max);
  371|  9.36M|    if (width == 4) {
  ------------------
  |  Branch (371:9): [True: 3.37M, False: 5.98M]
  ------------------
  372|  3.37M|      _mm_storel_epi64((__m128i *)dst, res);
  373|  5.98M|    } else {
  374|  5.98M|      _mm_storeu_si128((__m128i *)dst, res);
  375|  5.98M|    }
  376|  9.36M|    if (width >= 16) {
  ------------------
  |  Branch (376:9): [True: 0, False: 9.36M]
  ------------------
  377|      0|      const __m128i res_1 =
  378|      0|          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
  379|      0|      _mm_storeu_si128(((__m128i *)dst) + 1,
  380|      0|                       highbd_clamp_epi16(res_1, zeros, max));
  381|      0|    }
  382|  9.36M|    if (width == 32) {
  ------------------
  |  Branch (382:9): [True: 0, False: 9.36M]
  ------------------
  383|      0|      const __m128i res_2 =
  384|      0|          predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
  385|      0|      _mm_storeu_si128((__m128i *)(dst + 16),
  386|      0|                       highbd_clamp_epi16(res_2, zeros, max));
  387|      0|      const __m128i res_3 =
  388|      0|          predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
  389|      0|      _mm_storeu_si128((__m128i *)(dst + 24),
  390|      0|                       highbd_clamp_epi16(res_3, zeros, max));
  391|      0|    }
  392|  9.36M|    dst += dst_stride;
  393|  9.36M|  } while ((row += CFL_BUF_LINE_I128) < row_end);
  ------------------
  |  |  523|  9.36M|#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
  |  |  ------------------
  |  |  |  |  522|  9.36M|#define CFL_BUF_LINE (32)
  |  |  ------------------
  ------------------
  |  Branch (393:12): [True: 8.32M, False: 1.03M]
  ------------------
  394|  1.03M|}
cfl_ssse3.c:highbd_max_epi16:
  347|  1.03M|static inline __m128i highbd_max_epi16(int bd) {
  348|  1.03M|  const __m128i neg_one = _mm_set1_epi16(-1);
  349|       |  // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
  350|  1.03M|  return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one);
  351|  1.03M|}
cfl_ssse3.c:highbd_clamp_epi16:
  353|  9.36M|static inline __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) {
  354|  9.36M|  return _mm_max_epi16(_mm_min_epi16(u, max), zero);
  355|  9.36M|}

av1_convolve_2d_sr_avx2:
  147|  1.82M|    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
  148|  1.82M|#if CONFIG_SVT_AV1
  149|  1.82M|  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn);
  150|  1.82M|  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn);
  151|       |
  152|  1.82M|  const bool use_general = (tap_x == 12 || tap_y == 12);
  ------------------
  |  Branch (152:29): [True: 108, False: 1.82M]
  |  Branch (152:44): [True: 18.4E, False: 1.82M]
  ------------------
  153|  1.82M|  if (use_general) {
  ------------------
  |  Branch (153:7): [True: 0, False: 1.82M]
  ------------------
  154|      0|    convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  155|      0|                                filter_params_x, filter_params_y, subpel_x_qn,
  156|      0|                                subpel_y_qn, conv_params);
  157|  1.82M|  } else {
  158|  1.82M|    av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
  159|  1.82M|                                        filter_params_x, filter_params_y,
  160|  1.82M|                                        subpel_x_qn, subpel_y_qn, conv_params);
  161|  1.82M|  }
  162|       |#else
  163|       |  convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  164|       |                              filter_params_x, filter_params_y, subpel_x_qn,
  165|       |                              subpel_y_qn, conv_params);
  166|       |#endif
  167|  1.82M|}

av1_convolve_y_sr_avx2:
  517|   747k|                            const int32_t subpel_y_qn) {
  518|   747k|#if CONFIG_SVT_AV1
  519|   747k|  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
  520|       |
  521|   747k|  if (vert_tap == 12) {
  ------------------
  |  Branch (521:7): [True: 0, False: 747k]
  ------------------
  522|      0|    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  523|      0|                                   filter_params_y, subpel_y_qn);
  524|   747k|  } else {
  525|   747k|    av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
  526|   747k|                                       filter_params_y, subpel_y_qn);
  527|   747k|  }
  528|       |#else
  529|       |  av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  530|       |                                 filter_params_y, subpel_y_qn);
  531|       |#endif
  532|   747k|}
av1_convolve_x_sr_avx2:
  912|   828k|                            ConvolveParams *conv_params) {
  913|   828k|#if CONFIG_SVT_AV1
  914|   828k|  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
  915|       |
  916|   828k|  if (horz_tap == 12) {
  ------------------
  |  Branch (916:7): [True: 0, False: 828k]
  ------------------
  917|      0|    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  918|      0|                                   filter_params_x, subpel_x_qn, conv_params);
  919|   828k|  } else {
  920|   828k|    av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
  921|   828k|                                       filter_params_x, subpel_x_qn,
  922|   828k|                                       conv_params);
  923|   828k|  }
  924|       |#else
  925|       |  av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
  926|       |                                 filter_params_x, subpel_x_qn, conv_params);
  927|       |#endif
  928|   828k|}

av1_filter_intra_predictor_sse4_1:
  346|   670k|                                       const uint8_t *left, int mode) {
  347|   670k|  const int bw = tx_size_wide[tx_size];
  348|   670k|  const int bh = tx_size_high[tx_size];
  349|   670k|  filter_intra_predictor_sse4_1(dst, stride, above, left, mode, bw, bh);
  350|   670k|}
filterintra_sse4.c:filter_intra_predictor_sse4_1:
  216|   670k|                                                 const int height) {
  217|   670k|  const uint8_t *const top_ptr = (const uint8_t *)top_row;
  218|   670k|  const uint8_t *const left_ptr = (const uint8_t *)left_column;
  219|   670k|  uint8_t *dst = (uint8_t *)dest;
  220|   670k|  if (width == 4) {
  ------------------
  |  Branch (220:7): [True: 147k, False: 523k]
  ------------------
  221|   147k|    filter_4xh(dst, stride, top_ptr, left_ptr, mode, height);
  222|   147k|    return;
  223|   147k|  }
  224|       |
  225|       |  // There is one set of 7 taps for each of the 4x2 output pixels.
  226|   523k|  const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
  227|   523k|  const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
  228|   523k|  const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
  229|   523k|  const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
  230|       |
  231|       |  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
  232|       |  // the end is an unused value, which shall be multiplied by 0 when we apply
  233|       |  // the filter.
  234|   523k|  const int64_t kCondenseLeftMask = 0x0F09080403020100;
  235|       |
  236|       |  // Takes the "left section" and puts it right after p0-p4.
  237|   523k|  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
  238|       |
  239|       |  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
  240|       |  // byte is unused as above.
  241|   523k|  const int64_t kInsertTopLeftMask = 0x0F0A090302010008;
  242|       |
  243|       |  // Shuffles the "top left" from the left section, to the front. Used when
  244|       |  // grabbing data from left_column and not top_row.
  245|   523k|  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
  246|       |
  247|       |  // This first pass takes care of the cases where the top left pixel comes from
  248|       |  // top_row.
  249|   523k|  __m128i pixels = xx_loadl_64(top_ptr - 1);
  250|   523k|  __m128i left = _mm_slli_si128(xx_loadl_32(left_column), 8);
  251|   523k|  pixels = _mm_or_si128(pixels, left);
  252|       |
  253|       |  // Two sets of the same pixels to multiply with two sets of taps.
  254|   523k|  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  255|   523k|  filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  256|   523k|                    &taps_6_7);
  257|   523k|  left = _mm_srli_si128(left, 1);
  258|       |
  259|       |  // Load
  260|   523k|  pixels = xx_loadl_32(dst + stride);
  261|       |
  262|       |  // Because of the above shift, this OR 'invades' the final of the first 8
  263|       |  // bytes of |pixels|. This is acceptable because the 8th filter tap is always
  264|       |  // a padded 0.
  265|   523k|  pixels = _mm_or_si128(pixels, left);
  266|   523k|  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  267|   523k|  const ptrdiff_t stride2 = stride << 1;
  268|   523k|  const ptrdiff_t stride4 = stride << 2;
  269|   523k|  filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
  270|   523k|                    &taps_4_5, &taps_6_7);
  271|   523k|  dst += 4;
  272|  1.96M|  for (int x = 3; x < width - 4; x += 4) {
  ------------------
  |  Branch (272:19): [True: 1.44M, False: 523k]
  ------------------
  273|  1.44M|    pixels = xx_loadl_32(top_ptr + x);
  274|  1.44M|    pixels = _mm_insert_epi8(pixels, (int8_t)top_ptr[x + 4], 4);
  275|  1.44M|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
  276|  1.44M|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
  277|       |
  278|       |    // Duplicate bottom half into upper half.
  279|  1.44M|    pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
  280|  1.44M|    filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  281|  1.44M|                      &taps_6_7);
  282|  1.44M|    pixels = xx_loadl_32(dst + stride - 1);
  283|  1.44M|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
  284|  1.44M|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
  285|  1.44M|    pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + stride2 - 1], 6);
  286|       |
  287|       |    // Duplicate bottom half into upper half.
  288|  1.44M|    pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
  289|  1.44M|    filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
  290|  1.44M|                      &taps_4_5, &taps_6_7);
  291|  1.44M|    dst += 4;
  292|  1.44M|  }
  293|       |
  294|       |  // Now we handle heights that reference previous blocks rather than top_row.
  295|  1.47M|  for (int y = 4; y < height; y += 4) {
  ------------------
  |  Branch (295:19): [True: 952k, False: 523k]
  ------------------
  296|       |    // Leftmost 4x4 block for this height.
  297|   952k|    dst -= width;
  298|   952k|    dst += stride4;
  299|       |
  300|       |    // Top Left is not available by offset in these leftmost blocks.
  301|   952k|    pixels = xx_loadl_32(dst - stride);
  302|   952k|    left = _mm_slli_si128(xx_loadl_32(left_ptr + y - 1), 8);
  303|   952k|    left = _mm_insert_epi8(left, (int8_t)left_ptr[y + 3], 12);
  304|   952k|    pixels = _mm_or_si128(pixels, left);
  305|   952k|    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  306|   952k|    filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  307|   952k|                      &taps_6_7);
  308|       |
  309|       |    // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
  310|   952k|    left = _mm_srli_si128(left, 2);
  311|   952k|    pixels = xx_loadl_32(dst + stride);
  312|   952k|    pixels = _mm_or_si128(pixels, left);
  313|   952k|    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  314|   952k|    filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
  315|   952k|                      &taps_4_5, &taps_6_7);
  316|       |
  317|   952k|    dst += 4;
  318|       |
  319|       |    // Remaining 4x4 blocks for this height.
  320|  4.19M|    for (int x = 4; x < width; x += 4) {
  ------------------
  |  Branch (320:21): [True: 3.24M, False: 952k]
  ------------------
  321|  3.24M|      pixels = xx_loadl_32(dst - stride - 1);
  322|  3.24M|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[-stride + 3], 4);
  323|  3.24M|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
  324|  3.24M|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
  325|       |
  326|       |      // Duplicate bottom half into upper half.
  327|  3.24M|      pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
  328|  3.24M|      filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  329|  3.24M|                        &taps_6_7);
  330|  3.24M|      pixels = xx_loadl_32(dst + stride - 1);
  331|  3.24M|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
  332|  3.24M|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
  333|  3.24M|      pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 + stride - 1], 6);
  334|       |
  335|       |      // Duplicate bottom half into upper half.
  336|  3.24M|      pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
  337|  3.24M|      filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
  338|  3.24M|                        &taps_4_5, &taps_6_7);
  339|  3.24M|      dst += 4;
  340|  3.24M|    }
  341|   952k|  }
  342|   523k|}
filterintra_sse4.c:filter_4xh:
   62|   147k|                              const int height) {
   63|   147k|  const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
   64|   147k|  const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
   65|   147k|  const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
   66|   147k|  const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
   67|   147k|  __m128i top = xx_loadl_32(top_ptr - 1);
   68|   147k|  __m128i pixels = _mm_insert_epi8(top, (int8_t)top_ptr[3], 4);
   69|   147k|  __m128i left = (height == 4 ? xx_loadl_32(left_ptr) : xx_loadl_64(left_ptr));
  ------------------
  |  Branch (69:19): [True: 83.9k, False: 63.1k]
  ------------------
   70|   147k|  left = _mm_slli_si128(left, 5);
   71|       |
   72|       |  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
   73|       |  // left[2], left[3], left[4], left[5], left[6], left[7]
   74|   147k|  pixels = _mm_or_si128(left, pixels);
   75|       |
   76|       |  // Duplicate first 8 bytes.
   77|   147k|  pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
   78|   147k|  filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
   79|   147k|                    &taps_6_7);
   80|   147k|  dest += stride;  // Move to y = 1.
   81|   147k|  pixels = xx_loadl_32(dest);
   82|       |
   83|       |  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
   84|       |  // left[0], left[1], ...
   85|   147k|  pixels = _mm_or_si128(left, pixels);
   86|       |
   87|       |  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
   88|       |  // byte is an unused value, which shall be multiplied by 0 when we apply the
   89|       |  // filter.
   90|   147k|  const int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
   91|       |
   92|       |  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
   93|   147k|  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
   94|   147k|  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
   95|   147k|  dest += stride;  // Move to y = 2.
   96|   147k|  filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
   97|   147k|                    &taps_6_7);
   98|   147k|  dest += stride;  // Move to y = 3.
   99|       |
  100|       |  // Compute the middle 8 rows before using common code for the final 4 rows,
  101|       |  // which assumes that the left vector has the next TL at position 8.
  102|   147k|  if (height == 16) {
  ------------------
  |  Branch (102:7): [True: 29.7k, False: 117k]
  ------------------
  103|       |    // This shift allows us to use pixel_order2 twice after shifting by 2 later.
  104|  29.7k|    left = _mm_slli_si128(left, 1);
  105|  29.7k|    pixels = xx_loadl_32(dest);
  106|       |
  107|       |    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
  108|       |    // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
  109|  29.7k|    pixels = _mm_or_si128(left, pixels);
  110|       |
  111|       |    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
  112|       |    // last byte is an unused value, as above. The top-left was shifted to
  113|       |    // position nine to keep two empty spaces after the top pixels.
  114|  29.7k|    const int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
  115|       |
  116|       |    // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
  117|       |    // the end.
  118|  29.7k|    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
  119|  29.7k|    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  120|  29.7k|    dest += stride;  // Move to y = 4.
  121|       |
  122|       |    // First 4x2 in the if body.
  123|  29.7k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  124|  29.7k|                      &taps_6_7);
  125|       |
  126|       |    // Clear all but final pixel in the first 8 of left column.
  127|  29.7k|    __m128i keep_top_left = _mm_srli_si128(left, 13);
  128|  29.7k|    dest += stride;  // Move to y = 5.
  129|  29.7k|    pixels = xx_loadl_32(dest);
  130|  29.7k|    left = _mm_srli_si128(left, 2);
  131|       |
  132|       |    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
  133|       |    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
  134|  29.7k|    pixels = _mm_or_si128(left, pixels);
  135|  29.7k|    left = xx_loadl_64(left_ptr + 8);
  136|       |
  137|  29.7k|    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  138|  29.7k|    dest += stride;  // Move to y = 6.
  139|       |
  140|       |    // Second 4x2 in the if body.
  141|  29.7k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  142|  29.7k|                      &taps_6_7);
  143|       |
  144|       |    // Position TL value so we can use pixel_order1.
  145|  29.7k|    keep_top_left = _mm_slli_si128(keep_top_left, 6);
  146|  29.7k|    dest += stride;  // Move to y = 7.
  147|  29.7k|    pixels = xx_loadl_32(dest);
  148|  29.7k|    left = _mm_slli_si128(left, 7);
  149|  29.7k|    left = _mm_or_si128(left, keep_top_left);
  150|       |
  151|       |    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
  152|       |    // left[-1], left[0], left[1], left[2], left[3], ...
  153|  29.7k|    pixels = _mm_or_si128(left, pixels);
  154|  29.7k|    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  155|  29.7k|    dest += stride;  // Move to y = 8.
  156|       |
  157|       |    // Third 4x2 in the if body.
  158|  29.7k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  159|  29.7k|                      &taps_6_7);
  160|  29.7k|    dest += stride;  // Move to y = 9.
  161|       |
  162|       |    // Prepare final inputs.
  163|  29.7k|    pixels = xx_loadl_32(dest);
  164|  29.7k|    left = _mm_srli_si128(left, 2);
  165|       |
  166|       |    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
  167|       |    // left[-1], left[0], left[1], left[2], left[3], ...
  168|  29.7k|    pixels = _mm_or_si128(left, pixels);
  169|  29.7k|    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  170|  29.7k|    dest += stride;  // Move to y = 10.
  171|       |
  172|       |    // Fourth 4x2 in the if body.
  173|  29.7k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  174|  29.7k|                      &taps_6_7);
  175|  29.7k|    dest += stride;  // Move to y = 11.
  176|  29.7k|  }
  177|       |
  178|       |  // In both the 8 and 16 case, we assume that the left vector has the next TL
  179|       |  // at position 8.
  180|   147k|  if (height > 4) {
  ------------------
  |  Branch (180:7): [True: 63.2k, False: 83.9k]
  ------------------
  181|       |    // Erase prior left pixels by shifting TL to position 0.
  182|  63.2k|    left = _mm_srli_si128(left, 8);
  183|  63.2k|    left = _mm_slli_si128(left, 6);
  184|  63.2k|    pixels = xx_loadl_32(dest);
  185|       |
  186|       |    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
  187|       |    // left[-1], left[0], left[1], left[2], left[3], ...
  188|  63.2k|    pixels = _mm_or_si128(left, pixels);
  189|  63.2k|    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  190|  63.2k|    dest += stride;  // Move to y = 12 or 4.
  191|       |
  192|       |    // First of final two 4x2 blocks.
  193|  63.2k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  194|  63.2k|                      &taps_6_7);
  195|  63.2k|    dest += stride;  // Move to y = 13 or 5.
  196|  63.2k|    pixels = xx_loadl_32(dest);
  197|  63.2k|    left = _mm_srli_si128(left, 2);
  198|       |
  199|       |    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
  200|       |    // left[-1], left[0], left[1], left[2], left[3], ...
  201|  63.2k|    pixels = _mm_or_si128(left, pixels);
  202|  63.2k|    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  203|  63.2k|    dest += stride;  // Move to y = 14 or 6.
  204|       |
  205|       |    // Last of final two 4x2 blocks.
  206|  63.2k|    filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
  207|  63.2k|                      &taps_6_7);
  208|  63.2k|  }
  209|   147k|}
filterintra_sse4.c:filter_4x2_sse4_1:
   36|  12.8M|                                     const __m128i *taps_6_7) {
   37|  12.8M|  const __m128i mul_0_01 = _mm_maddubs_epi16(*pixels, *taps_0_1);
   38|  12.8M|  const __m128i mul_0_23 = _mm_maddubs_epi16(*pixels, *taps_2_3);
   39|       |  // |output_half| contains 8 partial sums.
   40|  12.8M|  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
   41|  12.8M|  __m128i output = _mm_hadd_epi16(output_half, output_half);
   42|  12.8M|  const __m128i output_row0 =
   43|  12.8M|      _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
   44|  12.8M|                       /* arbitrary pack arg */ output);
   45|  12.8M|  xx_storel_32(dst, output_row0);
   46|  12.8M|  const __m128i mul_1_01 = _mm_maddubs_epi16(*pixels, *taps_4_5);
   47|  12.8M|  const __m128i mul_1_23 = _mm_maddubs_epi16(*pixels, *taps_6_7);
   48|  12.8M|  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
   49|  12.8M|  output = _mm_hadd_epi16(output_half, output_half);
   50|  12.8M|  const __m128i output_row1 =
   51|  12.8M|      _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
   52|  12.8M|                       /* arbitrary pack arg */ output);
   53|  12.8M|  xx_storel_32(dst + stride, output_row1);
   54|  12.8M|}

av1_highbd_convolve_2d_sr_avx2:
   35|  2.99M|                                    ConvolveParams *conv_params, int bd) {
   36|  2.99M|  if (filter_params_x->taps == 12) {
  ------------------
  |  Branch (36:7): [True: 0, False: 2.99M]
  ------------------
   37|      0|    av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
   38|      0|                                    filter_params_x, filter_params_y,
   39|      0|                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
   40|      0|    return;
   41|      0|  }
   42|       |
   43|  2.99M|  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  ------------------
  |  |   19|  2.99M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   44|  2.99M|  int im_h = h + filter_params_y->taps - 1;
   45|  2.99M|  int im_stride = 8;
   46|  2.99M|  int i, j;
   47|  2.99M|  const int fo_vert = filter_params_y->taps / 2 - 1;
   48|  2.99M|  const int fo_horiz = filter_params_x->taps / 2 - 1;
   49|  2.99M|  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
   50|       |
   51|       |  // Check that, even with 12-bit input, the intermediate values will fit
   52|       |  // into an unsigned 16-bit intermediate array.
   53|  2.99M|  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
   54|       |
   55|  2.99M|  __m256i s[8], coeffs_y[4], coeffs_x[4];
   56|       |
   57|  2.99M|  const __m256i round_const_x = _mm256_set1_epi32(
   58|  2.99M|      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  ------------------
  |  |   21|  2.99M|#define FILTER_BITS 7
  ------------------
   59|  2.99M|  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
   60|       |
   61|  2.99M|  const __m256i round_const_y = _mm256_set1_epi32(
   62|  2.99M|      ((1 << conv_params->round_1) >> 1) -
   63|  2.99M|      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  ------------------
  |  |   21|  2.99M|#define FILTER_BITS 7
  ------------------
   64|  2.99M|  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
   65|       |
   66|  2.99M|  const int bits =
   67|  2.99M|      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  2.99M|#define FILTER_BITS 7
  ------------------
   68|  2.99M|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
   69|  2.99M|  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
   70|  2.99M|  const __m256i clip_pixel =
   71|  2.99M|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (71:25): [True: 2.98M, False: 5.49k]
  |  Branch (71:44): [True: 5.48k, False: 14]
  ------------------
   72|  2.99M|  const __m256i zero = _mm256_setzero_si256();
   73|       |
   74|  2.99M|  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
   75|  2.99M|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
   76|       |
   77|  7.08M|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (77:15): [True: 4.09M, False: 2.99M]
  ------------------
   78|       |    /* Horizontal filter */
   79|  4.09M|    {
   80|  45.7M|      for (i = 0; i < im_h; i += 2) {
  ------------------
  |  Branch (80:19): [True: 41.6M, False: 4.09M]
  ------------------
   81|  41.6M|        const __m256i row0 =
   82|  41.6M|            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
   83|  41.6M|        __m256i row1 = _mm256_setzero_si256();
   84|  41.6M|        if (i + 1 < im_h)
  ------------------
  |  Branch (84:13): [True: 37.5M, False: 4.10M]
  ------------------
   85|  37.5M|          row1 =
   86|  37.5M|              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
   87|       |
   88|  41.6M|        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
   89|  41.6M|        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
   90|       |
   91|       |        // even pixels
   92|  41.6M|        s[0] = _mm256_alignr_epi8(r1, r0, 0);
   93|  41.6M|        s[1] = _mm256_alignr_epi8(r1, r0, 4);
   94|  41.6M|        s[2] = _mm256_alignr_epi8(r1, r0, 8);
   95|  41.6M|        s[3] = _mm256_alignr_epi8(r1, r0, 12);
   96|       |
   97|  41.6M|        __m256i res_even = convolve(s, coeffs_x);
   98|  41.6M|        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
   99|  41.6M|                                    round_shift_x);
  100|       |
  101|       |        // odd pixels
  102|  41.6M|        s[0] = _mm256_alignr_epi8(r1, r0, 2);
  103|  41.6M|        s[1] = _mm256_alignr_epi8(r1, r0, 6);
  104|  41.6M|        s[2] = _mm256_alignr_epi8(r1, r0, 10);
  105|  41.6M|        s[3] = _mm256_alignr_epi8(r1, r0, 14);
  106|       |
  107|  41.6M|        __m256i res_odd = convolve(s, coeffs_x);
  108|  41.6M|        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
  109|  41.6M|                                   round_shift_x);
  110|       |
  111|  41.6M|        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
  112|  41.6M|        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
  113|  41.6M|        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
  114|       |
  115|  41.6M|        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
  116|  41.6M|      }
  117|  4.09M|    }
  118|       |
  119|       |    /* Vertical filter */
  120|  4.09M|    {
  121|  4.09M|      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
  122|  4.09M|      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
  123|  4.09M|      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
  124|  4.09M|      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
  125|  4.09M|      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
  126|  4.09M|      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
  127|       |
  128|  4.09M|      s[0] = _mm256_unpacklo_epi16(s0, s1);
  129|  4.09M|      s[1] = _mm256_unpacklo_epi16(s2, s3);
  130|  4.09M|      s[2] = _mm256_unpacklo_epi16(s4, s5);
  131|       |
  132|  4.09M|      s[4] = _mm256_unpackhi_epi16(s0, s1);
  133|  4.09M|      s[5] = _mm256_unpackhi_epi16(s2, s3);
  134|  4.09M|      s[6] = _mm256_unpackhi_epi16(s4, s5);
  135|       |
  136|  29.0M|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (136:19): [True: 24.9M, False: 4.09M]
  ------------------
  137|  24.9M|        const int16_t *data = &im_block[i * im_stride];
  138|       |
  139|  24.9M|        const __m256i s6 =
  140|  24.9M|            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
  141|  24.9M|        const __m256i s7 =
  142|  24.9M|            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
  143|       |
  144|  24.9M|        s[3] = _mm256_unpacklo_epi16(s6, s7);
  145|  24.9M|        s[7] = _mm256_unpackhi_epi16(s6, s7);
  146|       |
  147|  24.9M|        const __m256i res_a = convolve(s, coeffs_y);
  148|  24.9M|        __m256i res_a_round = _mm256_sra_epi32(
  149|  24.9M|            _mm256_add_epi32(res_a, round_const_y), round_shift_y);
  150|       |
  151|  24.9M|        res_a_round = _mm256_sra_epi32(
  152|  24.9M|            _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
  153|       |
  154|  24.9M|        if (w - j > 4) {
  ------------------
  |  Branch (154:13): [True: 20.9M, False: 4.05M]
  ------------------
  155|  20.9M|          const __m256i res_b = convolve(s + 4, coeffs_y);
  156|  20.9M|          __m256i res_b_round = _mm256_sra_epi32(
  157|  20.9M|              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
  158|  20.9M|          res_b_round =
  159|  20.9M|              _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
  160|  20.9M|                               round_shift_bits);
  161|       |
  162|  20.9M|          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
  163|  20.9M|          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
  164|  20.9M|          res_16bit = _mm256_max_epi16(res_16bit, zero);
  165|       |
  166|  20.9M|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
  167|  20.9M|                           _mm256_castsi256_si128(res_16bit));
  168|  20.9M|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
  169|  20.9M|                           _mm256_extracti128_si256(res_16bit, 1));
  170|  20.9M|        } else if (w == 4) {
  ------------------
  |  Branch (170:20): [True: 3.17M, False: 872k]
  ------------------
  171|  3.17M|          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
  172|  3.17M|          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
  173|  3.17M|          res_a_round = _mm256_max_epi16(res_a_round, zero);
  174|       |
  175|  3.17M|          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
  176|  3.17M|                           _mm256_castsi256_si128(res_a_round));
  177|  3.17M|          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
  178|  3.17M|                           _mm256_extracti128_si256(res_a_round, 1));
  179|  3.17M|        } else {
  180|   872k|          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
  181|   872k|          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
  182|   872k|          res_a_round = _mm256_max_epi16(res_a_round, zero);
  183|       |
  184|   872k|          xx_storel_32(&dst[i * dst_stride + j],
  185|   872k|                       _mm256_castsi256_si128(res_a_round));
  186|   872k|          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
  187|   872k|                       _mm256_extracti128_si256(res_a_round, 1));
  188|   872k|        }
  189|       |
  190|  24.9M|        s[0] = s[1];
  191|  24.9M|        s[1] = s[2];
  192|  24.9M|        s[2] = s[3];
  193|       |
  194|  24.9M|        s[4] = s[5];
  195|  24.9M|        s[5] = s[6];
  196|  24.9M|        s[6] = s[7];
  197|  24.9M|      }
  198|  4.09M|    }
  199|  4.09M|  }
  200|  2.99M|}

av1_highbd_inv_txfm_add_avx2:
 4215|  10.0M|                                  int stride, const TxfmParam *txfm_param) {
 4216|  10.0M|  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
 4217|  10.0M|  const TX_SIZE tx_size = txfm_param->tx_size;
 4218|  10.0M|  switch (tx_size) {
 4219|   496k|    case TX_4X8:
  ------------------
  |  Branch (4219:5): [True: 496k, False: 9.60M]
  ------------------
 4220|  1.43M|    case TX_8X4:
  ------------------
  |  Branch (4220:5): [True: 937k, False: 9.16M]
  ------------------
 4221|  3.80M|    case TX_4X4:
  ------------------
  |  Branch (4221:5): [True: 2.37M, False: 7.72M]
  ------------------
 4222|  4.48M|    case TX_16X4:
  ------------------
  |  Branch (4222:5): [True: 675k, False: 9.42M]
  ------------------
 4223|  4.91M|    case TX_4X16:
  ------------------
  |  Branch (4223:5): [True: 429k, False: 9.66M]
  ------------------
 4224|  4.91M|      av1_highbd_inv_txfm_add_sse4_1(input, dest, stride, txfm_param);
 4225|  4.91M|      break;
 4226|  5.18M|    default:
  ------------------
  |  Branch (4226:5): [True: 5.18M, False: 4.91M]
  ------------------
 4227|  5.18M|      av1_highbd_inv_txfm2d_add_universe_avx2(
 4228|  5.18M|          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
 4229|  5.18M|          txfm_param->eob, txfm_param->bd);
 4230|  5.18M|      break;
 4231|  10.0M|  }
 4232|  10.0M|}
highbd_inv_txfm_avx2.c:av1_highbd_inv_txfm2d_add_universe_avx2:
 4187|  5.18M|                                                    const int bd) {
 4188|  5.18M|  switch (tx_type) {
 4189|  2.87M|    case DCT_DCT:
  ------------------
  |  Branch (4189:5): [True: 2.87M, False: 2.31M]
  ------------------
 4190|  3.26M|    case ADST_DCT:
  ------------------
  |  Branch (4190:5): [True: 391k, False: 4.79M]
  ------------------
 4191|  3.87M|    case DCT_ADST:
  ------------------
  |  Branch (4191:5): [True: 609k, False: 4.57M]
  ------------------
 4192|  4.31M|    case ADST_ADST:
  ------------------
  |  Branch (4192:5): [True: 443k, False: 4.74M]
  ------------------
 4193|  4.35M|    case FLIPADST_DCT:
  ------------------
  |  Branch (4193:5): [True: 35.7k, False: 5.15M]
  ------------------
 4194|  4.43M|    case DCT_FLIPADST:
  ------------------
  |  Branch (4194:5): [True: 87.8k, False: 5.09M]
  ------------------
 4195|  4.47M|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (4195:5): [True: 30.9k, False: 5.15M]
  ------------------
 4196|  4.50M|    case ADST_FLIPADST:
  ------------------
  |  Branch (4196:5): [True: 29.6k, False: 5.15M]
  ------------------
 4197|  4.56M|    case FLIPADST_ADST:
  ------------------
  |  Branch (4197:5): [True: 66.6k, False: 5.12M]
  ------------------
 4198|  4.56M|      highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
  ------------------
  |  |   75|  4.56M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 4199|  4.56M|                                             stride, tx_type, tx_size, eob, bd);
 4200|  4.56M|      break;
 4201|   384k|    case IDTX:
  ------------------
  |  Branch (4201:5): [True: 384k, False: 4.80M]
  ------------------
 4202|   528k|    case H_DCT:
  ------------------
  |  Branch (4202:5): [True: 144k, False: 5.04M]
  ------------------
 4203|   549k|    case H_ADST:
  ------------------
  |  Branch (4203:5): [True: 21.0k, False: 5.16M]
  ------------------
 4204|   561k|    case H_FLIPADST:
  ------------------
  |  Branch (4204:5): [True: 11.9k, False: 5.17M]
  ------------------
 4205|   591k|    case V_DCT:
  ------------------
  |  Branch (4205:5): [True: 29.3k, False: 5.15M]
  ------------------
 4206|   615k|    case V_ADST:
  ------------------
  |  Branch (4206:5): [True: 24.1k, False: 5.16M]
  ------------------
 4207|   620k|    case V_FLIPADST:
  ------------------
  |  Branch (4207:5): [True: 5.33k, False: 5.18M]
  ------------------
 4208|   620k|      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
 4209|   620k|                                                tx_size, eob, bd);
 4210|   620k|      break;
 4211|      0|    default: assert(0); break;
  ------------------
  |  Branch (4211:5): [True: 0, False: 5.18M]
  ------------------
 4212|  5.18M|  }
 4213|  5.18M|}
highbd_inv_txfm_avx2.c:highbd_inv_txfm2d_add_no_identity_avx2:
 4111|  4.56M|                                                   const int bd) {
 4112|  4.56M|  __m256i buf1[64 * 8];
 4113|  4.56M|  int eobx, eoby;
 4114|  4.56M|  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
 4115|  4.56M|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 4116|  4.56M|  const int txw_idx = get_txw_idx(tx_size);
 4117|  4.56M|  const int txh_idx = get_txh_idx(tx_size);
 4118|  4.56M|  const int txfm_size_col = tx_size_wide[tx_size];
 4119|  4.56M|  const int txfm_size_row = tx_size_high[tx_size];
 4120|  4.56M|  const int buf_size_w_div8 = txfm_size_col >> 3;
 4121|  4.56M|  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
 4122|  4.56M|  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
 4123|  4.56M|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  4.56M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 130k, False: 4.43M]
  |  |  ------------------
  ------------------
 4124|  4.56M|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 4125|  4.56M|  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
 4126|  4.56M|  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
 4127|  4.56M|  const transform_1d_avx2 row_txfm =
 4128|  4.56M|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
 4129|  4.56M|  const transform_1d_avx2 col_txfm =
 4130|  4.56M|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
 4131|       |
 4132|  4.56M|  assert(col_txfm != NULL);
 4133|  4.56M|  assert(row_txfm != NULL);
 4134|  4.56M|  int ud_flip, lr_flip;
 4135|  4.56M|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 4136|       |
 4137|       |  // 1st stage: row transform
 4138|  9.72M|  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
  ------------------
  |  Branch (4138:19): [True: 5.15M, False: 4.56M]
  ------------------
 4139|  5.15M|    __m256i buf0[64];
 4140|  5.15M|    load_buffer_32bit_input(input + i * 8, input_stride, buf0,
 4141|  5.15M|                            buf_size_nonzero_w);
 4142|  5.15M|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (4142:9): [True: 742k, False: 4.41M]
  |  Branch (4142:27): [True: 482k, False: 3.92M]
  ------------------
 4143|  1.22M|      round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0,
 4144|  1.22M|                                     NewInvSqrt2);
 4145|  1.22M|    }
 4146|  5.15M|    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  5.15M|#define INV_COS_BIT 12
  ------------------
 4147|       |
 4148|  5.15M|    __m256i *_buf1 = buf1 + i * 8;
 4149|  5.15M|    if (lr_flip) {
  ------------------
  |  Branch (4149:9): [True: 157k, False: 4.99M]
  ------------------
 4150|   381k|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (4150:23): [True: 224k, False: 157k]
  ------------------
 4151|   224k|        transpose_8x8_flip_avx2(
 4152|   224k|            &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
 4153|   224k|      }
 4154|  4.99M|    } else {
 4155|  18.5M|      for (int j = 0; j < buf_size_w_div8; ++j) {
  ------------------
  |  Branch (4155:23): [True: 13.5M, False: 4.99M]
  ------------------
 4156|  13.5M|        transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
 4157|  13.5M|      }
 4158|  4.99M|    }
 4159|  5.15M|  }
 4160|       |  // 2nd stage: column transform
 4161|  15.9M|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (4161:19): [True: 11.4M, False: 4.56M]
  ------------------
 4162|  11.4M|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
  ------------------
  |  |   43|  11.4M|#define INV_COS_BIT 12
  ------------------
 4163|  11.4M|             bd, 0);
 4164|       |
 4165|  11.4M|    round_shift_array_32_avx2(buf1 + i * txfm_size_row,
 4166|  11.4M|                              buf1 + i * txfm_size_row, txfm_size_row,
 4167|  11.4M|                              -shift[1]);
 4168|  11.4M|  }
 4169|       |
 4170|       |  // write to buffer
 4171|  4.56M|  if (txfm_size_col >= 16) {
  ------------------
  |  Branch (4171:7): [True: 2.87M, False: 1.69M]
  ------------------
 4172|  7.76M|    for (int i = 0; i < (txfm_size_col >> 4); i++) {
  ------------------
  |  Branch (4172:21): [True: 4.89M, False: 2.87M]
  ------------------
 4173|  4.89M|      highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
 4174|  4.89M|                                    output + 16 * i, stride, ud_flip,
 4175|  4.89M|                                    txfm_size_row, bd);
 4176|  4.89M|    }
 4177|  2.87M|  } else if (txfm_size_col == 8) {
  ------------------
  |  Branch (4177:14): [True: 1.68M, False: 1.43k]
  ------------------
 4178|  1.68M|    highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
 4179|  1.68M|                                 bd);
 4180|  1.68M|  }
 4181|  4.56M|}
highbd_inv_txfm_avx2.c:idct8x8_low1_avx2:
 2414|  1.54M|                              int bd, int out_shift) {
 2415|  1.54M|  const int32_t *cospi = cospi_arr(bit);
 2416|  1.54M|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2417|  1.54M|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2418|  1.54M|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  3.09M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.54M]
  |  |  |  Branch (35:31): [True: 1.17M, False: 375k]
  |  |  |  Branch (35:44): [True: 1.17M, False: 375k]
  |  |  ------------------
  ------------------
 2419|  1.54M|  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2420|  1.54M|  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2421|  1.54M|  __m256i x;
 2422|       |
 2423|       |  // stage 0
 2424|       |  // stage 1
 2425|       |  // stage 2
 2426|       |  // stage 3
 2427|  1.54M|  x = _mm256_mullo_epi32(in[0], cospi32);
 2428|  1.54M|  x = _mm256_add_epi32(x, rnding);
 2429|  1.54M|  x = _mm256_srai_epi32(x, bit);
 2430|       |
 2431|       |  // stage 4
 2432|       |  // stage 5
 2433|  1.54M|  if (!do_cols) {
  ------------------
  |  Branch (2433:7): [True: 375k, False: 1.17M]
  ------------------
 2434|   375k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   375k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 375k]
  |  |  ------------------
  ------------------
 2435|   375k|    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
 2436|   375k|    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2437|   375k|    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2438|   375k|    x = _mm256_add_epi32(x, offset);
 2439|   375k|    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
 2440|   375k|  }
 2441|  1.54M|  x = _mm256_max_epi32(x, clamp_lo);
 2442|  1.54M|  x = _mm256_min_epi32(x, clamp_hi);
 2443|  1.54M|  out[0] = x;
 2444|  1.54M|  out[1] = x;
 2445|  1.54M|  out[2] = x;
 2446|  1.54M|  out[3] = x;
 2447|  1.54M|  out[4] = x;
 2448|  1.54M|  out[5] = x;
 2449|  1.54M|  out[6] = x;
 2450|  1.54M|  out[7] = x;
 2451|  1.54M|}
highbd_inv_txfm_avx2.c:idct8x8_avx2:
 2453|  3.08M|                         int bd, int out_shift) {
 2454|  3.08M|  const int32_t *cospi = cospi_arr(bit);
 2455|  3.08M|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 2456|  3.08M|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 2457|  3.08M|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 2458|  3.08M|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 2459|  3.08M|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 2460|  3.08M|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 2461|  3.08M|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2462|  3.08M|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 2463|  3.08M|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 2464|  3.08M|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 2465|  3.08M|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2466|  3.08M|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  6.16M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 3.08M]
  |  |  |  Branch (35:31): [True: 2.39M, False: 684k]
  |  |  |  Branch (35:44): [True: 2.39M, False: 684k]
  |  |  ------------------
  ------------------
 2467|  3.08M|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2468|  3.08M|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2469|  3.08M|  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
 2470|  3.08M|  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
 2471|  3.08M|  __m256i x, y;
 2472|       |
 2473|       |  // stage 0
 2474|       |  // stage 1
 2475|       |  // stage 2
 2476|  3.08M|  u0 = in[0];
 2477|  3.08M|  u1 = in[4];
 2478|  3.08M|  u2 = in[2];
 2479|  3.08M|  u3 = in[6];
 2480|       |
 2481|  3.08M|  x = _mm256_mullo_epi32(in[1], cospi56);
 2482|  3.08M|  y = _mm256_mullo_epi32(in[7], cospim8);
 2483|  3.08M|  u4 = _mm256_add_epi32(x, y);
 2484|  3.08M|  u4 = _mm256_add_epi32(u4, rnding);
 2485|  3.08M|  u4 = _mm256_srai_epi32(u4, bit);
 2486|       |
 2487|  3.08M|  x = _mm256_mullo_epi32(in[1], cospi8);
 2488|  3.08M|  y = _mm256_mullo_epi32(in[7], cospi56);
 2489|  3.08M|  u7 = _mm256_add_epi32(x, y);
 2490|  3.08M|  u7 = _mm256_add_epi32(u7, rnding);
 2491|  3.08M|  u7 = _mm256_srai_epi32(u7, bit);
 2492|       |
 2493|  3.08M|  x = _mm256_mullo_epi32(in[5], cospi24);
 2494|  3.08M|  y = _mm256_mullo_epi32(in[3], cospim40);
 2495|  3.08M|  u5 = _mm256_add_epi32(x, y);
 2496|  3.08M|  u5 = _mm256_add_epi32(u5, rnding);
 2497|  3.08M|  u5 = _mm256_srai_epi32(u5, bit);
 2498|       |
 2499|  3.08M|  x = _mm256_mullo_epi32(in[5], cospi40);
 2500|  3.08M|  y = _mm256_mullo_epi32(in[3], cospi24);
 2501|  3.08M|  u6 = _mm256_add_epi32(x, y);
 2502|  3.08M|  u6 = _mm256_add_epi32(u6, rnding);
 2503|  3.08M|  u6 = _mm256_srai_epi32(u6, bit);
 2504|       |
 2505|       |  // stage 3
 2506|  3.08M|  x = _mm256_mullo_epi32(u0, cospi32);
 2507|  3.08M|  y = _mm256_mullo_epi32(u1, cospi32);
 2508|  3.08M|  v0 = _mm256_add_epi32(x, y);
 2509|  3.08M|  v0 = _mm256_add_epi32(v0, rnding);
 2510|  3.08M|  v0 = _mm256_srai_epi32(v0, bit);
 2511|       |
 2512|  3.08M|  v1 = _mm256_sub_epi32(x, y);
 2513|  3.08M|  v1 = _mm256_add_epi32(v1, rnding);
 2514|  3.08M|  v1 = _mm256_srai_epi32(v1, bit);
 2515|       |
 2516|  3.08M|  x = _mm256_mullo_epi32(u2, cospi48);
 2517|  3.08M|  y = _mm256_mullo_epi32(u3, cospim16);
 2518|  3.08M|  v2 = _mm256_add_epi32(x, y);
 2519|  3.08M|  v2 = _mm256_add_epi32(v2, rnding);
 2520|  3.08M|  v2 = _mm256_srai_epi32(v2, bit);
 2521|       |
 2522|  3.08M|  x = _mm256_mullo_epi32(u2, cospi16);
 2523|  3.08M|  y = _mm256_mullo_epi32(u3, cospi48);
 2524|  3.08M|  v3 = _mm256_add_epi32(x, y);
 2525|  3.08M|  v3 = _mm256_add_epi32(v3, rnding);
 2526|  3.08M|  v3 = _mm256_srai_epi32(v3, bit);
 2527|       |
 2528|  3.08M|  addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
 2529|  3.08M|  addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
 2530|       |
 2531|       |  // stage 4
 2532|  3.08M|  addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
 2533|  3.08M|  addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
 2534|  3.08M|  u4 = v4;
 2535|  3.08M|  u7 = v7;
 2536|       |
 2537|  3.08M|  x = _mm256_mullo_epi32(v5, cospi32);
 2538|  3.08M|  y = _mm256_mullo_epi32(v6, cospi32);
 2539|  3.08M|  u6 = _mm256_add_epi32(y, x);
 2540|  3.08M|  u6 = _mm256_add_epi32(u6, rnding);
 2541|  3.08M|  u6 = _mm256_srai_epi32(u6, bit);
 2542|       |
 2543|  3.08M|  u5 = _mm256_sub_epi32(y, x);
 2544|  3.08M|  u5 = _mm256_add_epi32(u5, rnding);
 2545|  3.08M|  u5 = _mm256_srai_epi32(u5, bit);
 2546|       |
 2547|  3.08M|  addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
 2548|  3.08M|  addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
 2549|  3.08M|  addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
 2550|  3.08M|  addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
 2551|       |  // stage 5
 2552|  3.08M|  if (!do_cols) {
  ------------------
  |  Branch (2552:7): [True: 685k, False: 2.39M]
  ------------------
 2553|   685k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   685k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 685k]
  |  |  ------------------
  ------------------
 2554|   685k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2555|   685k|    const __m256i clamp_hi_out =
 2556|   685k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2557|       |
 2558|   685k|    round_shift_4x4_avx2(out, out_shift);
 2559|   685k|    round_shift_4x4_avx2(out + 4, out_shift);
 2560|   685k|    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
 2561|   685k|  }
 2562|  3.08M|}
highbd_inv_txfm_avx2.c:addsub_avx2:
  265|   367M|                        const __m256i *clamp_hi) {
  266|   367M|  __m256i a0 = _mm256_add_epi32(in0, in1);
  267|   367M|  __m256i a1 = _mm256_sub_epi32(in0, in1);
  268|       |
  269|   367M|  a0 = _mm256_max_epi32(a0, *clamp_lo);
  270|   367M|  a0 = _mm256_min_epi32(a0, *clamp_hi);
  271|   367M|  a1 = _mm256_max_epi32(a1, *clamp_lo);
  272|   367M|  a1 = _mm256_min_epi32(a1, *clamp_hi);
  273|       |
  274|   367M|  *out0 = a0;
  275|   367M|  *out1 = a1;
  276|   367M|}
highbd_inv_txfm_avx2.c:round_shift_4x4_avx2:
   50|  17.2M|static inline void round_shift_4x4_avx2(__m256i *in, int shift) {
   51|  17.2M|  if (shift != 0) {
  ------------------
  |  Branch (51:7): [True: 17.2M, False: 18.4E]
  ------------------
   52|  17.2M|    __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
   53|  17.2M|    in[0] = _mm256_add_epi32(in[0], rnding);
   54|  17.2M|    in[1] = _mm256_add_epi32(in[1], rnding);
   55|  17.2M|    in[2] = _mm256_add_epi32(in[2], rnding);
   56|  17.2M|    in[3] = _mm256_add_epi32(in[3], rnding);
   57|       |
   58|  17.2M|    in[0] = _mm256_srai_epi32(in[0], shift);
   59|  17.2M|    in[1] = _mm256_srai_epi32(in[1], shift);
   60|  17.2M|    in[2] = _mm256_srai_epi32(in[2], shift);
   61|  17.2M|    in[3] = _mm256_srai_epi32(in[3], shift);
   62|  17.2M|  }
   63|  17.2M|}
highbd_inv_txfm_avx2.c:highbd_clamp_epi32_avx2:
   74|  2.63M|                                    const __m256i *clamp_hi, int size) {
   75|  2.63M|  __m256i a0, a1;
   76|  19.8M|  for (int i = 0; i < size; i += 4) {
  ------------------
  |  Branch (76:19): [True: 17.2M, False: 2.63M]
  ------------------
   77|  17.2M|    a0 = _mm256_max_epi32(in[i], *clamp_lo);
   78|  17.2M|    out[i] = _mm256_min_epi32(a0, *clamp_hi);
   79|       |
   80|  17.2M|    a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
   81|  17.2M|    out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
   82|       |
   83|  17.2M|    a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
   84|  17.2M|    out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
   85|       |
   86|  17.2M|    a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
   87|  17.2M|    out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
   88|  17.2M|  }
   89|  2.63M|}
highbd_inv_txfm_avx2.c:iadst8x8_low1_avx2:
 2564|   496k|                               int bd, int out_shift) {
 2565|   496k|  const int32_t *cospi = cospi_arr(bit);
 2566|   496k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 2567|   496k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 2568|   496k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 2569|   496k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 2570|   496k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2571|   496k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2572|   496k|  const __m256i kZero = _mm256_setzero_si256();
 2573|   496k|  __m256i u[8], x;
 2574|       |
 2575|       |  // stage 0
 2576|       |  // stage 1
 2577|       |  // stage 2
 2578|       |
 2579|   496k|  x = _mm256_mullo_epi32(in[0], cospi60);
 2580|   496k|  u[0] = _mm256_add_epi32(x, rnding);
 2581|   496k|  u[0] = _mm256_srai_epi32(u[0], bit);
 2582|       |
 2583|   496k|  x = _mm256_mullo_epi32(in[0], cospi4);
 2584|   496k|  u[1] = _mm256_sub_epi32(kZero, x);
 2585|   496k|  u[1] = _mm256_add_epi32(u[1], rnding);
 2586|   496k|  u[1] = _mm256_srai_epi32(u[1], bit);
 2587|       |
 2588|       |  // stage 3
 2589|       |  // stage 4
 2590|   496k|  __m256i temp1, temp2;
 2591|   496k|  temp1 = _mm256_mullo_epi32(u[0], cospi16);
 2592|   496k|  x = _mm256_mullo_epi32(u[1], cospi48);
 2593|   496k|  temp1 = _mm256_add_epi32(temp1, x);
 2594|   496k|  temp1 = _mm256_add_epi32(temp1, rnding);
 2595|   496k|  temp1 = _mm256_srai_epi32(temp1, bit);
 2596|   496k|  u[4] = temp1;
 2597|       |
 2598|   496k|  temp2 = _mm256_mullo_epi32(u[0], cospi48);
 2599|   496k|  x = _mm256_mullo_epi32(u[1], cospi16);
 2600|   496k|  u[5] = _mm256_sub_epi32(temp2, x);
 2601|   496k|  u[5] = _mm256_add_epi32(u[5], rnding);
 2602|   496k|  u[5] = _mm256_srai_epi32(u[5], bit);
 2603|       |
 2604|       |  // stage 5
 2605|       |  // stage 6
 2606|   496k|  temp1 = _mm256_mullo_epi32(u[0], cospi32);
 2607|   496k|  x = _mm256_mullo_epi32(u[1], cospi32);
 2608|   496k|  u[2] = _mm256_add_epi32(temp1, x);
 2609|   496k|  u[2] = _mm256_add_epi32(u[2], rnding);
 2610|   496k|  u[2] = _mm256_srai_epi32(u[2], bit);
 2611|       |
 2612|   496k|  u[3] = _mm256_sub_epi32(temp1, x);
 2613|   496k|  u[3] = _mm256_add_epi32(u[3], rnding);
 2614|   496k|  u[3] = _mm256_srai_epi32(u[3], bit);
 2615|       |
 2616|   496k|  temp1 = _mm256_mullo_epi32(u[4], cospi32);
 2617|   496k|  x = _mm256_mullo_epi32(u[5], cospi32);
 2618|   496k|  u[6] = _mm256_add_epi32(temp1, x);
 2619|   496k|  u[6] = _mm256_add_epi32(u[6], rnding);
 2620|   496k|  u[6] = _mm256_srai_epi32(u[6], bit);
 2621|       |
 2622|   496k|  u[7] = _mm256_sub_epi32(temp1, x);
 2623|   496k|  u[7] = _mm256_add_epi32(u[7], rnding);
 2624|   496k|  u[7] = _mm256_srai_epi32(u[7], bit);
 2625|       |
 2626|       |  // stage 7
 2627|   496k|  if (do_cols) {
  ------------------
  |  Branch (2627:7): [True: 251k, False: 245k]
  ------------------
 2628|   251k|    out[0] = u[0];
 2629|   251k|    out[1] = _mm256_sub_epi32(kZero, u[4]);
 2630|   251k|    out[2] = u[6];
 2631|   251k|    out[3] = _mm256_sub_epi32(kZero, u[2]);
 2632|   251k|    out[4] = u[3];
 2633|   251k|    out[5] = _mm256_sub_epi32(kZero, u[7]);
 2634|   251k|    out[6] = u[5];
 2635|   251k|    out[7] = _mm256_sub_epi32(kZero, u[1]);
 2636|   251k|  } else {
 2637|   245k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   245k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 245k]
  |  |  ------------------
  ------------------
 2638|   245k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2639|   245k|    const __m256i clamp_hi_out =
 2640|   245k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2641|       |
 2642|   245k|    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2643|   245k|                   out_shift);
 2644|   245k|    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
 2645|   245k|                   out_shift);
 2646|   245k|    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
 2647|   245k|                   out_shift);
 2648|   245k|    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
 2649|   245k|                   out_shift);
 2650|   245k|  }
 2651|   496k|}
highbd_inv_txfm_avx2.c:neg_shift_avx2:
  143|  8.10M|                           const __m256i *clamp_hi, int shift) {
  144|  8.10M|  __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
  145|  8.10M|  __m256i a0 = _mm256_add_epi32(offset, in0);
  146|  8.10M|  __m256i a1 = _mm256_sub_epi32(offset, in1);
  147|       |
  148|  8.10M|  a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  149|  8.10M|  a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
  150|       |
  151|  8.10M|  a0 = _mm256_max_epi32(a0, *clamp_lo);
  152|  8.10M|  a0 = _mm256_min_epi32(a0, *clamp_hi);
  153|  8.10M|  a1 = _mm256_max_epi32(a1, *clamp_lo);
  154|  8.10M|  a1 = _mm256_min_epi32(a1, *clamp_hi);
  155|       |
  156|  8.10M|  *out0 = a0;
  157|  8.10M|  *out1 = a1;
  158|  8.10M|}
highbd_inv_txfm_avx2.c:iadst8x8_avx2:
 2654|  1.02M|                          int bd, int out_shift) {
 2655|  1.02M|  const int32_t *cospi = cospi_arr(bit);
 2656|  1.02M|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 2657|  1.02M|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 2658|  1.02M|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 2659|  1.02M|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 2660|  1.02M|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
 2661|  1.02M|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 2662|  1.02M|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
 2663|  1.02M|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 2664|  1.02M|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 2665|  1.02M|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 2666|  1.02M|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 2667|  1.02M|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2668|  1.02M|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2669|  1.02M|  const __m256i kZero = _mm256_setzero_si256();
 2670|  1.02M|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  2.05M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.02M]
  |  |  |  Branch (35:31): [True: 570k, False: 455k]
  |  |  |  Branch (35:44): [True: 570k, False: 455k]
  |  |  ------------------
  ------------------
 2671|  1.02M|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2672|  1.02M|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2673|  1.02M|  __m256i u[8], v[8], x;
 2674|       |
 2675|       |  // stage 0
 2676|       |  // stage 1
 2677|       |  // stage 2
 2678|       |
 2679|  1.02M|  u[0] = _mm256_mullo_epi32(in[7], cospi4);
 2680|  1.02M|  x = _mm256_mullo_epi32(in[0], cospi60);
 2681|  1.02M|  u[0] = _mm256_add_epi32(u[0], x);
 2682|  1.02M|  u[0] = _mm256_add_epi32(u[0], rnding);
 2683|  1.02M|  u[0] = _mm256_srai_epi32(u[0], bit);
 2684|       |
 2685|  1.02M|  u[1] = _mm256_mullo_epi32(in[7], cospi60);
 2686|  1.02M|  x = _mm256_mullo_epi32(in[0], cospi4);
 2687|  1.02M|  u[1] = _mm256_sub_epi32(u[1], x);
 2688|  1.02M|  u[1] = _mm256_add_epi32(u[1], rnding);
 2689|  1.02M|  u[1] = _mm256_srai_epi32(u[1], bit);
 2690|       |
 2691|  1.02M|  u[2] = _mm256_mullo_epi32(in[5], cospi20);
 2692|  1.02M|  x = _mm256_mullo_epi32(in[2], cospi44);
 2693|  1.02M|  u[2] = _mm256_add_epi32(u[2], x);
 2694|  1.02M|  u[2] = _mm256_add_epi32(u[2], rnding);
 2695|  1.02M|  u[2] = _mm256_srai_epi32(u[2], bit);
 2696|       |
 2697|  1.02M|  u[3] = _mm256_mullo_epi32(in[5], cospi44);
 2698|  1.02M|  x = _mm256_mullo_epi32(in[2], cospi20);
 2699|  1.02M|  u[3] = _mm256_sub_epi32(u[3], x);
 2700|  1.02M|  u[3] = _mm256_add_epi32(u[3], rnding);
 2701|  1.02M|  u[3] = _mm256_srai_epi32(u[3], bit);
 2702|       |
 2703|  1.02M|  u[4] = _mm256_mullo_epi32(in[3], cospi36);
 2704|  1.02M|  x = _mm256_mullo_epi32(in[4], cospi28);
 2705|  1.02M|  u[4] = _mm256_add_epi32(u[4], x);
 2706|  1.02M|  u[4] = _mm256_add_epi32(u[4], rnding);
 2707|  1.02M|  u[4] = _mm256_srai_epi32(u[4], bit);
 2708|       |
 2709|  1.02M|  u[5] = _mm256_mullo_epi32(in[3], cospi28);
 2710|  1.02M|  x = _mm256_mullo_epi32(in[4], cospi36);
 2711|  1.02M|  u[5] = _mm256_sub_epi32(u[5], x);
 2712|  1.02M|  u[5] = _mm256_add_epi32(u[5], rnding);
 2713|  1.02M|  u[5] = _mm256_srai_epi32(u[5], bit);
 2714|       |
 2715|  1.02M|  u[6] = _mm256_mullo_epi32(in[1], cospi52);
 2716|  1.02M|  x = _mm256_mullo_epi32(in[6], cospi12);
 2717|  1.02M|  u[6] = _mm256_add_epi32(u[6], x);
 2718|  1.02M|  u[6] = _mm256_add_epi32(u[6], rnding);
 2719|  1.02M|  u[6] = _mm256_srai_epi32(u[6], bit);
 2720|       |
 2721|  1.02M|  u[7] = _mm256_mullo_epi32(in[1], cospi12);
 2722|  1.02M|  x = _mm256_mullo_epi32(in[6], cospi52);
 2723|  1.02M|  u[7] = _mm256_sub_epi32(u[7], x);
 2724|  1.02M|  u[7] = _mm256_add_epi32(u[7], rnding);
 2725|  1.02M|  u[7] = _mm256_srai_epi32(u[7], bit);
 2726|       |
 2727|       |  // stage 3
 2728|  1.02M|  addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
 2729|  1.02M|  addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
 2730|  1.02M|  addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
 2731|  1.02M|  addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
 2732|       |
 2733|       |  // stage 4
 2734|  1.02M|  u[0] = v[0];
 2735|  1.02M|  u[1] = v[1];
 2736|  1.02M|  u[2] = v[2];
 2737|  1.02M|  u[3] = v[3];
 2738|       |
 2739|  1.02M|  u[4] = _mm256_mullo_epi32(v[4], cospi16);
 2740|  1.02M|  x = _mm256_mullo_epi32(v[5], cospi48);
 2741|  1.02M|  u[4] = _mm256_add_epi32(u[4], x);
 2742|  1.02M|  u[4] = _mm256_add_epi32(u[4], rnding);
 2743|  1.02M|  u[4] = _mm256_srai_epi32(u[4], bit);
 2744|       |
 2745|  1.02M|  u[5] = _mm256_mullo_epi32(v[4], cospi48);
 2746|  1.02M|  x = _mm256_mullo_epi32(v[5], cospi16);
 2747|  1.02M|  u[5] = _mm256_sub_epi32(u[5], x);
 2748|  1.02M|  u[5] = _mm256_add_epi32(u[5], rnding);
 2749|  1.02M|  u[5] = _mm256_srai_epi32(u[5], bit);
 2750|       |
 2751|  1.02M|  u[6] = _mm256_mullo_epi32(v[6], cospim48);
 2752|  1.02M|  x = _mm256_mullo_epi32(v[7], cospi16);
 2753|  1.02M|  u[6] = _mm256_add_epi32(u[6], x);
 2754|  1.02M|  u[6] = _mm256_add_epi32(u[6], rnding);
 2755|  1.02M|  u[6] = _mm256_srai_epi32(u[6], bit);
 2756|       |
 2757|  1.02M|  u[7] = _mm256_mullo_epi32(v[6], cospi16);
 2758|  1.02M|  x = _mm256_mullo_epi32(v[7], cospim48);
 2759|  1.02M|  u[7] = _mm256_sub_epi32(u[7], x);
 2760|  1.02M|  u[7] = _mm256_add_epi32(u[7], rnding);
 2761|  1.02M|  u[7] = _mm256_srai_epi32(u[7], bit);
 2762|       |
 2763|       |  // stage 5
 2764|  1.02M|  addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
 2765|  1.02M|  addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
 2766|  1.02M|  addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
 2767|  1.02M|  addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
 2768|       |
 2769|       |  // stage 6
 2770|  1.02M|  u[0] = v[0];
 2771|  1.02M|  u[1] = v[1];
 2772|  1.02M|  u[4] = v[4];
 2773|  1.02M|  u[5] = v[5];
 2774|       |
 2775|  1.02M|  v[0] = _mm256_mullo_epi32(v[2], cospi32);
 2776|  1.02M|  x = _mm256_mullo_epi32(v[3], cospi32);
 2777|  1.02M|  u[2] = _mm256_add_epi32(v[0], x);
 2778|  1.02M|  u[2] = _mm256_add_epi32(u[2], rnding);
 2779|  1.02M|  u[2] = _mm256_srai_epi32(u[2], bit);
 2780|       |
 2781|  1.02M|  u[3] = _mm256_sub_epi32(v[0], x);
 2782|  1.02M|  u[3] = _mm256_add_epi32(u[3], rnding);
 2783|  1.02M|  u[3] = _mm256_srai_epi32(u[3], bit);
 2784|       |
 2785|  1.02M|  v[0] = _mm256_mullo_epi32(v[6], cospi32);
 2786|  1.02M|  x = _mm256_mullo_epi32(v[7], cospi32);
 2787|  1.02M|  u[6] = _mm256_add_epi32(v[0], x);
 2788|  1.02M|  u[6] = _mm256_add_epi32(u[6], rnding);
 2789|  1.02M|  u[6] = _mm256_srai_epi32(u[6], bit);
 2790|       |
 2791|  1.02M|  u[7] = _mm256_sub_epi32(v[0], x);
 2792|  1.02M|  u[7] = _mm256_add_epi32(u[7], rnding);
 2793|  1.02M|  u[7] = _mm256_srai_epi32(u[7], bit);
 2794|       |
 2795|       |  // stage 7
 2796|  1.02M|  if (do_cols) {
  ------------------
  |  Branch (2796:7): [True: 570k, False: 455k]
  ------------------
 2797|   570k|    out[0] = u[0];
 2798|   570k|    out[1] = _mm256_sub_epi32(kZero, u[4]);
 2799|   570k|    out[2] = u[6];
 2800|   570k|    out[3] = _mm256_sub_epi32(kZero, u[2]);
 2801|   570k|    out[4] = u[3];
 2802|   570k|    out[5] = _mm256_sub_epi32(kZero, u[7]);
 2803|   570k|    out[6] = u[5];
 2804|   570k|    out[7] = _mm256_sub_epi32(kZero, u[1]);
 2805|   570k|  } else {
 2806|   455k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   455k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 455k]
  |  |  ------------------
  ------------------
 2807|   455k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2808|   455k|    const __m256i clamp_hi_out =
 2809|   455k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2810|       |
 2811|   455k|    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2812|   455k|                   out_shift);
 2813|   455k|    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
 2814|   455k|                   out_shift);
 2815|   455k|    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
 2816|   455k|                   out_shift);
 2817|   455k|    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
 2818|   455k|                   out_shift);
 2819|   455k|  }
 2820|  1.02M|}
highbd_inv_txfm_avx2.c:idct16_low1_avx2:
 1155|  1.41M|                             int bd, int out_shift) {
 1156|  1.41M|  const int32_t *cospi = cospi_arr(bit);
 1157|  1.41M|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1158|  1.41M|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1159|  1.41M|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  2.82M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.41M]
  |  |  |  Branch (35:31): [True: 1.10M, False: 309k]
  |  |  |  Branch (35:44): [True: 1.10M, False: 309k]
  |  |  ------------------
  ------------------
 1160|  1.41M|  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 1161|  1.41M|  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 1162|       |
 1163|  1.41M|  {
 1164|       |    // stage 0
 1165|       |    // stage 1
 1166|       |    // stage 2
 1167|       |    // stage 3
 1168|       |    // stage 4
 1169|  1.41M|    in[0] = _mm256_mullo_epi32(in[0], cospi32);
 1170|  1.41M|    in[0] = _mm256_add_epi32(in[0], rnding);
 1171|  1.41M|    in[0] = _mm256_srai_epi32(in[0], bit);
 1172|       |
 1173|       |    // stage 5
 1174|       |    // stage 6
 1175|       |    // stage 7
 1176|  1.41M|    if (!do_cols) {
  ------------------
  |  Branch (1176:9): [True: 310k, False: 1.10M]
  ------------------
 1177|   310k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   310k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 310k]
  |  |  ------------------
  ------------------
 1178|   310k|      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1179|   310k|      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1180|   310k|      __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
 1181|   310k|      in[0] = _mm256_add_epi32(in[0], offset);
 1182|   310k|      in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
 1183|   310k|    }
 1184|  1.41M|    in[0] = _mm256_max_epi32(in[0], clamp_lo);
 1185|  1.41M|    in[0] = _mm256_min_epi32(in[0], clamp_hi);
 1186|  1.41M|    out[0] = in[0];
 1187|  1.41M|    out[1] = in[0];
 1188|  1.41M|    out[2] = in[0];
 1189|  1.41M|    out[3] = in[0];
 1190|  1.41M|    out[4] = in[0];
 1191|  1.41M|    out[5] = in[0];
 1192|  1.41M|    out[6] = in[0];
 1193|  1.41M|    out[7] = in[0];
 1194|  1.41M|    out[8] = in[0];
 1195|  1.41M|    out[9] = in[0];
 1196|  1.41M|    out[10] = in[0];
 1197|  1.41M|    out[11] = in[0];
 1198|  1.41M|    out[12] = in[0];
 1199|  1.41M|    out[13] = in[0];
 1200|  1.41M|    out[14] = in[0];
 1201|  1.41M|    out[15] = in[0];
 1202|  1.41M|  }
 1203|  1.41M|}
highbd_inv_txfm_avx2.c:idct16_low8_avx2:
 1206|  1.95M|                             int bd, int out_shift) {
 1207|  1.95M|  const int32_t *cospi = cospi_arr(bit);
 1208|  1.95M|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 1209|  1.95M|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 1210|  1.95M|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 1211|  1.95M|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 1212|  1.95M|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 1213|  1.95M|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 1214|  1.95M|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 1215|  1.95M|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 1216|  1.95M|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 1217|  1.95M|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 1218|  1.95M|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1219|  1.95M|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 1220|  1.95M|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 1221|  1.95M|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 1222|  1.95M|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 1223|  1.95M|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 1224|  1.95M|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 1225|  1.95M|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1226|  1.95M|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  3.91M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.95M]
  |  |  |  Branch (35:31): [True: 1.54M, False: 416k]
  |  |  |  Branch (35:44): [True: 1.54M, False: 416k]
  |  |  ------------------
  ------------------
 1227|  1.95M|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 1228|  1.95M|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 1229|  1.95M|  __m256i u[16], x, y;
 1230|       |
 1231|  1.95M|  {
 1232|       |    // stage 0
 1233|       |    // stage 1
 1234|  1.95M|    u[0] = in[0];
 1235|  1.95M|    u[2] = in[4];
 1236|  1.95M|    u[4] = in[2];
 1237|  1.95M|    u[6] = in[6];
 1238|  1.95M|    u[8] = in[1];
 1239|  1.95M|    u[10] = in[5];
 1240|  1.95M|    u[12] = in[3];
 1241|  1.95M|    u[14] = in[7];
 1242|       |
 1243|       |    // stage 2
 1244|  1.95M|    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
 1245|  1.95M|    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
 1246|       |
 1247|  1.95M|    u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
 1248|  1.95M|    u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
 1249|       |
 1250|  1.95M|    u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
 1251|  1.95M|    u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
 1252|       |
 1253|  1.95M|    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
 1254|  1.95M|    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
 1255|       |
 1256|       |    // stage 3
 1257|  1.95M|    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
 1258|  1.95M|    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
 1259|  1.95M|    u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
 1260|  1.95M|    u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
 1261|       |
 1262|  1.95M|    addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
 1263|  1.95M|    addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
 1264|  1.95M|    addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
 1265|  1.95M|    addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 1266|       |
 1267|       |    // stage 4
 1268|  1.95M|    x = _mm256_mullo_epi32(u[0], cospi32);
 1269|  1.95M|    u[0] = _mm256_add_epi32(x, rnding);
 1270|  1.95M|    u[0] = _mm256_srai_epi32(u[0], bit);
 1271|  1.95M|    u[1] = u[0];
 1272|       |
 1273|  1.95M|    u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
 1274|  1.95M|    u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
 1275|       |
 1276|  1.95M|    addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
 1277|  1.95M|    addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
 1278|       |
 1279|  1.95M|    x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 1280|  1.95M|    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 1281|  1.95M|    u[9] = x;
 1282|  1.95M|    y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 1283|  1.95M|    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 1284|  1.95M|    u[10] = y;
 1285|       |
 1286|       |    // stage 5
 1287|  1.95M|    addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 1288|  1.95M|    addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 1289|       |
 1290|  1.95M|    x = _mm256_mullo_epi32(u[5], cospi32);
 1291|  1.95M|    y = _mm256_mullo_epi32(u[6], cospi32);
 1292|  1.95M|    u[5] = _mm256_sub_epi32(y, x);
 1293|  1.95M|    u[5] = _mm256_add_epi32(u[5], rnding);
 1294|  1.95M|    u[5] = _mm256_srai_epi32(u[5], bit);
 1295|       |
 1296|  1.95M|    u[6] = _mm256_add_epi32(y, x);
 1297|  1.95M|    u[6] = _mm256_add_epi32(u[6], rnding);
 1298|  1.95M|    u[6] = _mm256_srai_epi32(u[6], bit);
 1299|       |
 1300|  1.95M|    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 1301|  1.95M|    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 1302|  1.95M|    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 1303|  1.95M|    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 1304|       |
 1305|       |    // stage 6
 1306|  1.95M|    addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
 1307|  1.95M|    addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
 1308|  1.95M|    addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
 1309|  1.95M|    addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
 1310|       |
 1311|  1.95M|    x = _mm256_mullo_epi32(u[10], cospi32);
 1312|  1.95M|    y = _mm256_mullo_epi32(u[13], cospi32);
 1313|  1.95M|    u[10] = _mm256_sub_epi32(y, x);
 1314|  1.95M|    u[10] = _mm256_add_epi32(u[10], rnding);
 1315|  1.95M|    u[10] = _mm256_srai_epi32(u[10], bit);
 1316|       |
 1317|  1.95M|    u[13] = _mm256_add_epi32(x, y);
 1318|  1.95M|    u[13] = _mm256_add_epi32(u[13], rnding);
 1319|  1.95M|    u[13] = _mm256_srai_epi32(u[13], bit);
 1320|       |
 1321|  1.95M|    x = _mm256_mullo_epi32(u[11], cospi32);
 1322|  1.95M|    y = _mm256_mullo_epi32(u[12], cospi32);
 1323|  1.95M|    u[11] = _mm256_sub_epi32(y, x);
 1324|  1.95M|    u[11] = _mm256_add_epi32(u[11], rnding);
 1325|  1.95M|    u[11] = _mm256_srai_epi32(u[11], bit);
 1326|       |
 1327|  1.95M|    u[12] = _mm256_add_epi32(x, y);
 1328|  1.95M|    u[12] = _mm256_add_epi32(u[12], rnding);
 1329|  1.95M|    u[12] = _mm256_srai_epi32(u[12], bit);
 1330|       |    // stage 7
 1331|  1.95M|    addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
 1332|  1.95M|    addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
 1333|  1.95M|    addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
 1334|  1.95M|    addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
 1335|  1.95M|    addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
 1336|  1.95M|    addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
 1337|  1.95M|    addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
 1338|  1.95M|    addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 1339|       |
 1340|  1.95M|    if (!do_cols) {
  ------------------
  |  Branch (1340:9): [True: 416k, False: 1.53M]
  ------------------
 1341|   416k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   416k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 416k]
  |  |  ------------------
  ------------------
 1342|   416k|      const __m256i clamp_lo_out =
 1343|   416k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1344|   416k|      const __m256i clamp_hi_out =
 1345|   416k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1346|   416k|      round_shift_8x8_avx2(out, out_shift);
 1347|   416k|      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
 1348|   416k|    }
 1349|  1.95M|  }
 1350|  1.95M|}
highbd_inv_txfm_avx2.c:half_btf_0_avx2:
  242|  95.9M|                                      const __m256i *rounding, int bit) {
  243|  95.9M|  __m256i x;
  244|  95.9M|  x = _mm256_mullo_epi32(*w0, *n0);
  245|  95.9M|  x = _mm256_add_epi32(x, *rounding);
  246|  95.9M|  x = _mm256_srai_epi32(x, bit);
  247|  95.9M|  return x;
  248|  95.9M|}
highbd_inv_txfm_avx2.c:half_btf_avx2:
  252|   203M|                                    const __m256i *rounding, int bit) {
  253|   203M|  __m256i x, y;
  254|       |
  255|   203M|  x = _mm256_mullo_epi32(*w0, *n0);
  256|   203M|  y = _mm256_mullo_epi32(*w1, *n1);
  257|   203M|  x = _mm256_add_epi32(x, y);
  258|   203M|  x = _mm256_add_epi32(x, *rounding);
  259|   203M|  x = _mm256_srai_epi32(x, bit);
  260|   203M|  return x;
  261|   203M|}
highbd_inv_txfm_avx2.c:round_shift_8x8_avx2:
   65|  3.96M|static inline void round_shift_8x8_avx2(__m256i *in, int shift) {
   66|  3.96M|  round_shift_4x4_avx2(in, shift);
   67|  3.96M|  round_shift_4x4_avx2(in + 4, shift);
   68|  3.96M|  round_shift_4x4_avx2(in + 8, shift);
   69|  3.96M|  round_shift_4x4_avx2(in + 12, shift);
   70|  3.96M|}
highbd_inv_txfm_avx2.c:idct16_avx2:
 1353|  1.16M|                        int out_shift) {
 1354|  1.16M|  const int32_t *cospi = cospi_arr(bit);
 1355|  1.16M|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 1356|  1.16M|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
 1357|  1.16M|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 1358|  1.16M|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 1359|  1.16M|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 1360|  1.16M|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 1361|  1.16M|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
 1362|  1.16M|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 1363|  1.16M|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 1364|  1.16M|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
 1365|  1.16M|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
 1366|  1.16M|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 1367|  1.16M|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 1368|  1.16M|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 1369|  1.16M|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 1370|  1.16M|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 1371|  1.16M|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 1372|  1.16M|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 1373|  1.16M|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1374|  1.16M|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 1375|  1.16M|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 1376|  1.16M|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 1377|  1.16M|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 1378|  1.16M|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1379|  1.16M|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  2.33M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.16M]
  |  |  |  Branch (35:31): [True: 891k, False: 273k]
  |  |  |  Branch (35:44): [True: 891k, False: 273k]
  |  |  ------------------
  ------------------
 1380|  1.16M|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 1381|  1.16M|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 1382|  1.16M|  __m256i u[16], v[16], x, y;
 1383|       |
 1384|  1.16M|  {
 1385|       |    // stage 0
 1386|       |    // stage 1
 1387|  1.16M|    u[0] = in[0];
 1388|  1.16M|    u[1] = in[8];
 1389|  1.16M|    u[2] = in[4];
 1390|  1.16M|    u[3] = in[12];
 1391|  1.16M|    u[4] = in[2];
 1392|  1.16M|    u[5] = in[10];
 1393|  1.16M|    u[6] = in[6];
 1394|  1.16M|    u[7] = in[14];
 1395|  1.16M|    u[8] = in[1];
 1396|  1.16M|    u[9] = in[9];
 1397|  1.16M|    u[10] = in[5];
 1398|  1.16M|    u[11] = in[13];
 1399|  1.16M|    u[12] = in[3];
 1400|  1.16M|    u[13] = in[11];
 1401|  1.16M|    u[14] = in[7];
 1402|  1.16M|    u[15] = in[15];
 1403|       |
 1404|       |    // stage 2
 1405|  1.16M|    v[0] = u[0];
 1406|  1.16M|    v[1] = u[1];
 1407|  1.16M|    v[2] = u[2];
 1408|  1.16M|    v[3] = u[3];
 1409|  1.16M|    v[4] = u[4];
 1410|  1.16M|    v[5] = u[5];
 1411|  1.16M|    v[6] = u[6];
 1412|  1.16M|    v[7] = u[7];
 1413|       |
 1414|  1.16M|    v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
 1415|  1.16M|    v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
 1416|  1.16M|    v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
 1417|  1.16M|    v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
 1418|  1.16M|    v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
 1419|  1.16M|    v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
 1420|  1.16M|    v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
 1421|  1.16M|    v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
 1422|       |
 1423|       |    // stage 3
 1424|  1.16M|    u[0] = v[0];
 1425|  1.16M|    u[1] = v[1];
 1426|  1.16M|    u[2] = v[2];
 1427|  1.16M|    u[3] = v[3];
 1428|  1.16M|    u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
 1429|  1.16M|    u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
 1430|  1.16M|    u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
 1431|  1.16M|    u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
 1432|  1.16M|    addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
 1433|  1.16M|    addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
 1434|  1.16M|    addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
 1435|  1.16M|    addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 1436|       |
 1437|       |    // stage 4
 1438|  1.16M|    x = _mm256_mullo_epi32(u[0], cospi32);
 1439|  1.16M|    y = _mm256_mullo_epi32(u[1], cospi32);
 1440|  1.16M|    v[0] = _mm256_add_epi32(x, y);
 1441|  1.16M|    v[0] = _mm256_add_epi32(v[0], rnding);
 1442|  1.16M|    v[0] = _mm256_srai_epi32(v[0], bit);
 1443|       |
 1444|  1.16M|    v[1] = _mm256_sub_epi32(x, y);
 1445|  1.16M|    v[1] = _mm256_add_epi32(v[1], rnding);
 1446|  1.16M|    v[1] = _mm256_srai_epi32(v[1], bit);
 1447|       |
 1448|  1.16M|    v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
 1449|  1.16M|    v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
 1450|  1.16M|    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
 1451|  1.16M|    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
 1452|  1.16M|    v[8] = u[8];
 1453|  1.16M|    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 1454|  1.16M|    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 1455|  1.16M|    v[11] = u[11];
 1456|  1.16M|    v[12] = u[12];
 1457|  1.16M|    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 1458|  1.16M|    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 1459|  1.16M|    v[15] = u[15];
 1460|       |
 1461|       |    // stage 5
 1462|  1.16M|    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 1463|  1.16M|    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 1464|  1.16M|    u[4] = v[4];
 1465|       |
 1466|  1.16M|    x = _mm256_mullo_epi32(v[5], cospi32);
 1467|  1.16M|    y = _mm256_mullo_epi32(v[6], cospi32);
 1468|  1.16M|    u[5] = _mm256_sub_epi32(y, x);
 1469|  1.16M|    u[5] = _mm256_add_epi32(u[5], rnding);
 1470|  1.16M|    u[5] = _mm256_srai_epi32(u[5], bit);
 1471|       |
 1472|  1.16M|    u[6] = _mm256_add_epi32(y, x);
 1473|  1.16M|    u[6] = _mm256_add_epi32(u[6], rnding);
 1474|  1.16M|    u[6] = _mm256_srai_epi32(u[6], bit);
 1475|       |
 1476|  1.16M|    u[7] = v[7];
 1477|  1.16M|    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 1478|  1.16M|    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 1479|  1.16M|    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 1480|  1.16M|    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 1481|       |
 1482|       |    // stage 6
 1483|  1.16M|    addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
 1484|  1.16M|    addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
 1485|  1.16M|    addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
 1486|  1.16M|    addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
 1487|  1.16M|    v[8] = u[8];
 1488|  1.16M|    v[9] = u[9];
 1489|       |
 1490|  1.16M|    x = _mm256_mullo_epi32(u[10], cospi32);
 1491|  1.16M|    y = _mm256_mullo_epi32(u[13], cospi32);
 1492|  1.16M|    v[10] = _mm256_sub_epi32(y, x);
 1493|  1.16M|    v[10] = _mm256_add_epi32(v[10], rnding);
 1494|  1.16M|    v[10] = _mm256_srai_epi32(v[10], bit);
 1495|       |
 1496|  1.16M|    v[13] = _mm256_add_epi32(x, y);
 1497|  1.16M|    v[13] = _mm256_add_epi32(v[13], rnding);
 1498|  1.16M|    v[13] = _mm256_srai_epi32(v[13], bit);
 1499|       |
 1500|  1.16M|    x = _mm256_mullo_epi32(u[11], cospi32);
 1501|  1.16M|    y = _mm256_mullo_epi32(u[12], cospi32);
 1502|  1.16M|    v[11] = _mm256_sub_epi32(y, x);
 1503|  1.16M|    v[11] = _mm256_add_epi32(v[11], rnding);
 1504|  1.16M|    v[11] = _mm256_srai_epi32(v[11], bit);
 1505|       |
 1506|  1.16M|    v[12] = _mm256_add_epi32(x, y);
 1507|  1.16M|    v[12] = _mm256_add_epi32(v[12], rnding);
 1508|  1.16M|    v[12] = _mm256_srai_epi32(v[12], bit);
 1509|       |
 1510|  1.16M|    v[14] = u[14];
 1511|  1.16M|    v[15] = u[15];
 1512|       |
 1513|       |    // stage 7
 1514|  1.16M|    addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
 1515|  1.16M|    addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
 1516|  1.16M|    addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
 1517|  1.16M|    addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
 1518|  1.16M|    addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
 1519|  1.16M|    addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
 1520|  1.16M|    addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
 1521|  1.16M|    addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 1522|       |
 1523|  1.16M|    if (!do_cols) {
  ------------------
  |  Branch (1523:9): [True: 273k, False: 891k]
  ------------------
 1524|   273k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   273k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 273k]
  |  |  ------------------
  ------------------
 1525|   273k|      const __m256i clamp_lo_out =
 1526|   273k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1527|   273k|      const __m256i clamp_hi_out =
 1528|   273k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1529|   273k|      round_shift_8x8_avx2(out, out_shift);
 1530|   273k|      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
 1531|   273k|    }
 1532|  1.16M|  }
 1533|  1.16M|}
highbd_inv_txfm_avx2.c:iadst16_low1_avx2:
 1536|   274k|                              int bd, int out_shift) {
 1537|   274k|  const int32_t *cospi = cospi_arr(bit);
 1538|   274k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 1539|   274k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 1540|   274k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 1541|   274k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 1542|   274k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 1543|   274k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 1544|   274k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1545|   274k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1546|   274k|  const __m256i zero = _mm256_setzero_si256();
 1547|   274k|  __m256i v[16], x, y, temp1, temp2;
 1548|       |
 1549|       |  // Calculate the column 0, 1, 2, 3
 1550|   274k|  {
 1551|       |    // stage 0
 1552|       |    // stage 1
 1553|       |    // stage 2
 1554|   274k|    x = _mm256_mullo_epi32(in[0], cospi62);
 1555|   274k|    v[0] = _mm256_add_epi32(x, rnding);
 1556|   274k|    v[0] = _mm256_srai_epi32(v[0], bit);
 1557|       |
 1558|   274k|    x = _mm256_mullo_epi32(in[0], cospi2);
 1559|   274k|    v[1] = _mm256_sub_epi32(zero, x);
 1560|   274k|    v[1] = _mm256_add_epi32(v[1], rnding);
 1561|   274k|    v[1] = _mm256_srai_epi32(v[1], bit);
 1562|       |
 1563|       |    // stage 3
 1564|   274k|    v[8] = v[0];
 1565|   274k|    v[9] = v[1];
 1566|       |
 1567|       |    // stage 4
 1568|   274k|    temp1 = _mm256_mullo_epi32(v[8], cospi8);
 1569|   274k|    x = _mm256_mullo_epi32(v[9], cospi56);
 1570|   274k|    temp1 = _mm256_add_epi32(temp1, x);
 1571|   274k|    temp1 = _mm256_add_epi32(temp1, rnding);
 1572|   274k|    temp1 = _mm256_srai_epi32(temp1, bit);
 1573|       |
 1574|   274k|    temp2 = _mm256_mullo_epi32(v[8], cospi56);
 1575|   274k|    x = _mm256_mullo_epi32(v[9], cospi8);
 1576|   274k|    temp2 = _mm256_sub_epi32(temp2, x);
 1577|   274k|    temp2 = _mm256_add_epi32(temp2, rnding);
 1578|   274k|    temp2 = _mm256_srai_epi32(temp2, bit);
 1579|   274k|    v[8] = temp1;
 1580|   274k|    v[9] = temp2;
 1581|       |
 1582|       |    // stage 5
 1583|   274k|    v[4] = v[0];
 1584|   274k|    v[5] = v[1];
 1585|   274k|    v[12] = v[8];
 1586|   274k|    v[13] = v[9];
 1587|       |
 1588|       |    // stage 6
 1589|   274k|    temp1 = _mm256_mullo_epi32(v[4], cospi16);
 1590|   274k|    x = _mm256_mullo_epi32(v[5], cospi48);
 1591|   274k|    temp1 = _mm256_add_epi32(temp1, x);
 1592|   274k|    temp1 = _mm256_add_epi32(temp1, rnding);
 1593|   274k|    temp1 = _mm256_srai_epi32(temp1, bit);
 1594|       |
 1595|   274k|    temp2 = _mm256_mullo_epi32(v[4], cospi48);
 1596|   274k|    x = _mm256_mullo_epi32(v[5], cospi16);
 1597|   274k|    temp2 = _mm256_sub_epi32(temp2, x);
 1598|   274k|    temp2 = _mm256_add_epi32(temp2, rnding);
 1599|   274k|    temp2 = _mm256_srai_epi32(temp2, bit);
 1600|   274k|    v[4] = temp1;
 1601|   274k|    v[5] = temp2;
 1602|       |
 1603|   274k|    temp1 = _mm256_mullo_epi32(v[12], cospi16);
 1604|   274k|    x = _mm256_mullo_epi32(v[13], cospi48);
 1605|   274k|    temp1 = _mm256_add_epi32(temp1, x);
 1606|   274k|    temp1 = _mm256_add_epi32(temp1, rnding);
 1607|   274k|    temp1 = _mm256_srai_epi32(temp1, bit);
 1608|       |
 1609|   274k|    temp2 = _mm256_mullo_epi32(v[12], cospi48);
 1610|   274k|    x = _mm256_mullo_epi32(v[13], cospi16);
 1611|   274k|    temp2 = _mm256_sub_epi32(temp2, x);
 1612|   274k|    temp2 = _mm256_add_epi32(temp2, rnding);
 1613|   274k|    temp2 = _mm256_srai_epi32(temp2, bit);
 1614|   274k|    v[12] = temp1;
 1615|   274k|    v[13] = temp2;
 1616|       |
 1617|       |    // stage 7
 1618|   274k|    v[2] = v[0];
 1619|   274k|    v[3] = v[1];
 1620|   274k|    v[6] = v[4];
 1621|   274k|    v[7] = v[5];
 1622|   274k|    v[10] = v[8];
 1623|   274k|    v[11] = v[9];
 1624|   274k|    v[14] = v[12];
 1625|   274k|    v[15] = v[13];
 1626|       |
 1627|       |    // stage 8
 1628|   274k|    y = _mm256_mullo_epi32(v[2], cospi32);
 1629|   274k|    x = _mm256_mullo_epi32(v[3], cospi32);
 1630|   274k|    v[2] = _mm256_add_epi32(y, x);
 1631|   274k|    v[2] = _mm256_add_epi32(v[2], rnding);
 1632|   274k|    v[2] = _mm256_srai_epi32(v[2], bit);
 1633|       |
 1634|   274k|    v[3] = _mm256_sub_epi32(y, x);
 1635|   274k|    v[3] = _mm256_add_epi32(v[3], rnding);
 1636|   274k|    v[3] = _mm256_srai_epi32(v[3], bit);
 1637|       |
 1638|   274k|    y = _mm256_mullo_epi32(v[6], cospi32);
 1639|   274k|    x = _mm256_mullo_epi32(v[7], cospi32);
 1640|   274k|    v[6] = _mm256_add_epi32(y, x);
 1641|   274k|    v[6] = _mm256_add_epi32(v[6], rnding);
 1642|   274k|    v[6] = _mm256_srai_epi32(v[6], bit);
 1643|       |
 1644|   274k|    v[7] = _mm256_sub_epi32(y, x);
 1645|   274k|    v[7] = _mm256_add_epi32(v[7], rnding);
 1646|   274k|    v[7] = _mm256_srai_epi32(v[7], bit);
 1647|       |
 1648|   274k|    y = _mm256_mullo_epi32(v[10], cospi32);
 1649|   274k|    x = _mm256_mullo_epi32(v[11], cospi32);
 1650|   274k|    v[10] = _mm256_add_epi32(y, x);
 1651|   274k|    v[10] = _mm256_add_epi32(v[10], rnding);
 1652|   274k|    v[10] = _mm256_srai_epi32(v[10], bit);
 1653|       |
 1654|   274k|    v[11] = _mm256_sub_epi32(y, x);
 1655|   274k|    v[11] = _mm256_add_epi32(v[11], rnding);
 1656|   274k|    v[11] = _mm256_srai_epi32(v[11], bit);
 1657|       |
 1658|   274k|    y = _mm256_mullo_epi32(v[14], cospi32);
 1659|   274k|    x = _mm256_mullo_epi32(v[15], cospi32);
 1660|   274k|    v[14] = _mm256_add_epi32(y, x);
 1661|   274k|    v[14] = _mm256_add_epi32(v[14], rnding);
 1662|   274k|    v[14] = _mm256_srai_epi32(v[14], bit);
 1663|       |
 1664|   274k|    v[15] = _mm256_sub_epi32(y, x);
 1665|   274k|    v[15] = _mm256_add_epi32(v[15], rnding);
 1666|   274k|    v[15] = _mm256_srai_epi32(v[15], bit);
 1667|       |
 1668|       |    // stage 9
 1669|   274k|    if (do_cols) {
  ------------------
  |  Branch (1669:9): [True: 134k, False: 139k]
  ------------------
 1670|   134k|      out[0] = v[0];
 1671|   134k|      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
 1672|   134k|      out[2] = v[12];
 1673|   134k|      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
 1674|   134k|      out[4] = v[6];
 1675|   134k|      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
 1676|   134k|      out[6] = v[10];
 1677|   134k|      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
 1678|   134k|      out[8] = v[3];
 1679|   134k|      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
 1680|   134k|      out[10] = v[15];
 1681|   134k|      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
 1682|   134k|      out[12] = v[5];
 1683|   134k|      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
 1684|   134k|      out[14] = v[9];
 1685|   134k|      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
 1686|   139k|    } else {
 1687|   139k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   139k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 139k]
  |  |  ------------------
  ------------------
 1688|   139k|      const __m256i clamp_lo_out =
 1689|   139k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1690|   139k|      const __m256i clamp_hi_out =
 1691|   139k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1692|       |
 1693|   139k|      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 1694|   139k|                     out_shift);
 1695|   139k|      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
 1696|   139k|                     &clamp_hi_out, out_shift);
 1697|   139k|      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
 1698|   139k|                     &clamp_hi_out, out_shift);
 1699|   139k|      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
 1700|   139k|                     &clamp_hi_out, out_shift);
 1701|   139k|      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
 1702|   139k|                     &clamp_hi_out, out_shift);
 1703|   139k|      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
 1704|   139k|                     &clamp_hi_out, out_shift);
 1705|   139k|      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
 1706|   139k|                     &clamp_hi_out, out_shift);
 1707|   139k|      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
 1708|   139k|                     &clamp_hi_out, out_shift);
 1709|   139k|    }
 1710|   274k|  }
 1711|   274k|}
highbd_inv_txfm_avx2.c:iadst16_low8_avx2:
 1714|   692k|                              int bd, int out_shift) {
 1715|   692k|  const int32_t *cospi = cospi_arr(bit);
 1716|   692k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 1717|   692k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 1718|   692k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
 1719|   692k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
 1720|   692k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
 1721|   692k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
 1722|   692k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
 1723|   692k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
 1724|   692k|  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
 1725|   692k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
 1726|   692k|  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
 1727|   692k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
 1728|   692k|  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
 1729|   692k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
 1730|   692k|  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
 1731|   692k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 1732|   692k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 1733|   692k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 1734|   692k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 1735|   692k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 1736|   692k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 1737|   692k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 1738|   692k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 1739|   692k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 1740|   692k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 1741|   692k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 1742|   692k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 1743|   692k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  1.38M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 692k]
  |  |  |  Branch (35:31): [True: 351k, False: 340k]
  |  |  |  Branch (35:44): [True: 351k, False: 340k]
  |  |  ------------------
  ------------------
 1744|   692k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 1745|   692k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 1746|   692k|  __m256i u[16], x, y;
 1747|       |
 1748|   692k|  {
 1749|       |    // stage 0
 1750|       |    // stage 1
 1751|       |    // stage 2
 1752|   692k|    __m256i zero = _mm256_setzero_si256();
 1753|   692k|    x = _mm256_mullo_epi32(in[0], cospi62);
 1754|   692k|    u[0] = _mm256_add_epi32(x, rnding);
 1755|   692k|    u[0] = _mm256_srai_epi32(u[0], bit);
 1756|       |
 1757|   692k|    x = _mm256_mullo_epi32(in[0], cospi2);
 1758|   692k|    u[1] = _mm256_sub_epi32(zero, x);
 1759|   692k|    u[1] = _mm256_add_epi32(u[1], rnding);
 1760|   692k|    u[1] = _mm256_srai_epi32(u[1], bit);
 1761|       |
 1762|   692k|    x = _mm256_mullo_epi32(in[2], cospi54);
 1763|   692k|    u[2] = _mm256_add_epi32(x, rnding);
 1764|   692k|    u[2] = _mm256_srai_epi32(u[2], bit);
 1765|       |
 1766|   692k|    x = _mm256_mullo_epi32(in[2], cospi10);
 1767|   692k|    u[3] = _mm256_sub_epi32(zero, x);
 1768|   692k|    u[3] = _mm256_add_epi32(u[3], rnding);
 1769|   692k|    u[3] = _mm256_srai_epi32(u[3], bit);
 1770|       |
 1771|   692k|    x = _mm256_mullo_epi32(in[4], cospi46);
 1772|   692k|    u[4] = _mm256_add_epi32(x, rnding);
 1773|   692k|    u[4] = _mm256_srai_epi32(u[4], bit);
 1774|       |
 1775|   692k|    x = _mm256_mullo_epi32(in[4], cospi18);
 1776|   692k|    u[5] = _mm256_sub_epi32(zero, x);
 1777|   692k|    u[5] = _mm256_add_epi32(u[5], rnding);
 1778|   692k|    u[5] = _mm256_srai_epi32(u[5], bit);
 1779|       |
 1780|   692k|    x = _mm256_mullo_epi32(in[6], cospi38);
 1781|   692k|    u[6] = _mm256_add_epi32(x, rnding);
 1782|   692k|    u[6] = _mm256_srai_epi32(u[6], bit);
 1783|       |
 1784|   692k|    x = _mm256_mullo_epi32(in[6], cospi26);
 1785|   692k|    u[7] = _mm256_sub_epi32(zero, x);
 1786|   692k|    u[7] = _mm256_add_epi32(u[7], rnding);
 1787|   692k|    u[7] = _mm256_srai_epi32(u[7], bit);
 1788|       |
 1789|   692k|    u[8] = _mm256_mullo_epi32(in[7], cospi34);
 1790|   692k|    u[8] = _mm256_add_epi32(u[8], rnding);
 1791|   692k|    u[8] = _mm256_srai_epi32(u[8], bit);
 1792|       |
 1793|   692k|    u[9] = _mm256_mullo_epi32(in[7], cospi30);
 1794|   692k|    u[9] = _mm256_add_epi32(u[9], rnding);
 1795|   692k|    u[9] = _mm256_srai_epi32(u[9], bit);
 1796|       |
 1797|   692k|    u[10] = _mm256_mullo_epi32(in[5], cospi42);
 1798|   692k|    u[10] = _mm256_add_epi32(u[10], rnding);
 1799|   692k|    u[10] = _mm256_srai_epi32(u[10], bit);
 1800|       |
 1801|   692k|    u[11] = _mm256_mullo_epi32(in[5], cospi22);
 1802|   692k|    u[11] = _mm256_add_epi32(u[11], rnding);
 1803|   692k|    u[11] = _mm256_srai_epi32(u[11], bit);
 1804|       |
 1805|   692k|    u[12] = _mm256_mullo_epi32(in[3], cospi50);
 1806|   692k|    u[12] = _mm256_add_epi32(u[12], rnding);
 1807|   692k|    u[12] = _mm256_srai_epi32(u[12], bit);
 1808|       |
 1809|   692k|    u[13] = _mm256_mullo_epi32(in[3], cospi14);
 1810|   692k|    u[13] = _mm256_add_epi32(u[13], rnding);
 1811|   692k|    u[13] = _mm256_srai_epi32(u[13], bit);
 1812|       |
 1813|   692k|    u[14] = _mm256_mullo_epi32(in[1], cospi58);
 1814|   692k|    u[14] = _mm256_add_epi32(u[14], rnding);
 1815|   692k|    u[14] = _mm256_srai_epi32(u[14], bit);
 1816|       |
 1817|   692k|    u[15] = _mm256_mullo_epi32(in[1], cospi6);
 1818|   692k|    u[15] = _mm256_add_epi32(u[15], rnding);
 1819|   692k|    u[15] = _mm256_srai_epi32(u[15], bit);
 1820|       |
 1821|       |    // stage 3
 1822|   692k|    addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
 1823|   692k|    addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
 1824|   692k|    addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
 1825|   692k|    addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
 1826|   692k|    addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
 1827|   692k|    addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
 1828|   692k|    addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
 1829|   692k|    addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 1830|       |
 1831|       |    // stage 4
 1832|   692k|    y = _mm256_mullo_epi32(u[8], cospi56);
 1833|   692k|    x = _mm256_mullo_epi32(u[9], cospi56);
 1834|   692k|    u[8] = _mm256_mullo_epi32(u[8], cospi8);
 1835|   692k|    u[8] = _mm256_add_epi32(u[8], x);
 1836|   692k|    u[8] = _mm256_add_epi32(u[8], rnding);
 1837|   692k|    u[8] = _mm256_srai_epi32(u[8], bit);
 1838|       |
 1839|   692k|    x = _mm256_mullo_epi32(u[9], cospi8);
 1840|   692k|    u[9] = _mm256_sub_epi32(y, x);
 1841|   692k|    u[9] = _mm256_add_epi32(u[9], rnding);
 1842|   692k|    u[9] = _mm256_srai_epi32(u[9], bit);
 1843|       |
 1844|   692k|    x = _mm256_mullo_epi32(u[11], cospi24);
 1845|   692k|    y = _mm256_mullo_epi32(u[10], cospi24);
 1846|   692k|    u[10] = _mm256_mullo_epi32(u[10], cospi40);
 1847|   692k|    u[10] = _mm256_add_epi32(u[10], x);
 1848|   692k|    u[10] = _mm256_add_epi32(u[10], rnding);
 1849|   692k|    u[10] = _mm256_srai_epi32(u[10], bit);
 1850|       |
 1851|   692k|    x = _mm256_mullo_epi32(u[11], cospi40);
 1852|   692k|    u[11] = _mm256_sub_epi32(y, x);
 1853|   692k|    u[11] = _mm256_add_epi32(u[11], rnding);
 1854|   692k|    u[11] = _mm256_srai_epi32(u[11], bit);
 1855|       |
 1856|   692k|    x = _mm256_mullo_epi32(u[13], cospi8);
 1857|   692k|    y = _mm256_mullo_epi32(u[12], cospi8);
 1858|   692k|    u[12] = _mm256_mullo_epi32(u[12], cospim56);
 1859|   692k|    u[12] = _mm256_add_epi32(u[12], x);
 1860|   692k|    u[12] = _mm256_add_epi32(u[12], rnding);
 1861|   692k|    u[12] = _mm256_srai_epi32(u[12], bit);
 1862|       |
 1863|   692k|    x = _mm256_mullo_epi32(u[13], cospim56);
 1864|   692k|    u[13] = _mm256_sub_epi32(y, x);
 1865|   692k|    u[13] = _mm256_add_epi32(u[13], rnding);
 1866|   692k|    u[13] = _mm256_srai_epi32(u[13], bit);
 1867|       |
 1868|   692k|    x = _mm256_mullo_epi32(u[15], cospi40);
 1869|   692k|    y = _mm256_mullo_epi32(u[14], cospi40);
 1870|   692k|    u[14] = _mm256_mullo_epi32(u[14], cospim24);
 1871|   692k|    u[14] = _mm256_add_epi32(u[14], x);
 1872|   692k|    u[14] = _mm256_add_epi32(u[14], rnding);
 1873|   692k|    u[14] = _mm256_srai_epi32(u[14], bit);
 1874|       |
 1875|   692k|    x = _mm256_mullo_epi32(u[15], cospim24);
 1876|   692k|    u[15] = _mm256_sub_epi32(y, x);
 1877|   692k|    u[15] = _mm256_add_epi32(u[15], rnding);
 1878|   692k|    u[15] = _mm256_srai_epi32(u[15], bit);
 1879|       |
 1880|       |    // stage 5
 1881|   692k|    addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
 1882|   692k|    addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
 1883|   692k|    addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
 1884|   692k|    addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
 1885|   692k|    addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
 1886|   692k|    addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
 1887|   692k|    addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
 1888|   692k|    addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 1889|       |
 1890|       |    // stage 6
 1891|   692k|    x = _mm256_mullo_epi32(u[5], cospi48);
 1892|   692k|    y = _mm256_mullo_epi32(u[4], cospi48);
 1893|   692k|    u[4] = _mm256_mullo_epi32(u[4], cospi16);
 1894|   692k|    u[4] = _mm256_add_epi32(u[4], x);
 1895|   692k|    u[4] = _mm256_add_epi32(u[4], rnding);
 1896|   692k|    u[4] = _mm256_srai_epi32(u[4], bit);
 1897|       |
 1898|   692k|    x = _mm256_mullo_epi32(u[5], cospi16);
 1899|   692k|    u[5] = _mm256_sub_epi32(y, x);
 1900|   692k|    u[5] = _mm256_add_epi32(u[5], rnding);
 1901|   692k|    u[5] = _mm256_srai_epi32(u[5], bit);
 1902|       |
 1903|   692k|    x = _mm256_mullo_epi32(u[7], cospi16);
 1904|   692k|    y = _mm256_mullo_epi32(u[6], cospi16);
 1905|   692k|    u[6] = _mm256_mullo_epi32(u[6], cospim48);
 1906|   692k|    u[6] = _mm256_add_epi32(u[6], x);
 1907|   692k|    u[6] = _mm256_add_epi32(u[6], rnding);
 1908|   692k|    u[6] = _mm256_srai_epi32(u[6], bit);
 1909|       |
 1910|   692k|    x = _mm256_mullo_epi32(u[7], cospim48);
 1911|   692k|    u[7] = _mm256_sub_epi32(y, x);
 1912|   692k|    u[7] = _mm256_add_epi32(u[7], rnding);
 1913|   692k|    u[7] = _mm256_srai_epi32(u[7], bit);
 1914|       |
 1915|   692k|    x = _mm256_mullo_epi32(u[13], cospi48);
 1916|   692k|    y = _mm256_mullo_epi32(u[12], cospi48);
 1917|   692k|    u[12] = _mm256_mullo_epi32(u[12], cospi16);
 1918|   692k|    u[12] = _mm256_add_epi32(u[12], x);
 1919|   692k|    u[12] = _mm256_add_epi32(u[12], rnding);
 1920|   692k|    u[12] = _mm256_srai_epi32(u[12], bit);
 1921|       |
 1922|   692k|    x = _mm256_mullo_epi32(u[13], cospi16);
 1923|   692k|    u[13] = _mm256_sub_epi32(y, x);
 1924|   692k|    u[13] = _mm256_add_epi32(u[13], rnding);
 1925|   692k|    u[13] = _mm256_srai_epi32(u[13], bit);
 1926|       |
 1927|   692k|    x = _mm256_mullo_epi32(u[15], cospi16);
 1928|   692k|    y = _mm256_mullo_epi32(u[14], cospi16);
 1929|   692k|    u[14] = _mm256_mullo_epi32(u[14], cospim48);
 1930|   692k|    u[14] = _mm256_add_epi32(u[14], x);
 1931|   692k|    u[14] = _mm256_add_epi32(u[14], rnding);
 1932|   692k|    u[14] = _mm256_srai_epi32(u[14], bit);
 1933|       |
 1934|   692k|    x = _mm256_mullo_epi32(u[15], cospim48);
 1935|   692k|    u[15] = _mm256_sub_epi32(y, x);
 1936|   692k|    u[15] = _mm256_add_epi32(u[15], rnding);
 1937|   692k|    u[15] = _mm256_srai_epi32(u[15], bit);
 1938|       |
 1939|       |    // stage 7
 1940|   692k|    addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
 1941|   692k|    addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
 1942|   692k|    addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
 1943|   692k|    addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
 1944|   692k|    addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
 1945|   692k|    addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
 1946|   692k|    addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
 1947|   692k|    addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 1948|       |
 1949|       |    // stage 8
 1950|   692k|    y = _mm256_mullo_epi32(u[2], cospi32);
 1951|   692k|    x = _mm256_mullo_epi32(u[3], cospi32);
 1952|   692k|    u[2] = _mm256_add_epi32(y, x);
 1953|   692k|    u[2] = _mm256_add_epi32(u[2], rnding);
 1954|   692k|    u[2] = _mm256_srai_epi32(u[2], bit);
 1955|       |
 1956|   692k|    u[3] = _mm256_sub_epi32(y, x);
 1957|   692k|    u[3] = _mm256_add_epi32(u[3], rnding);
 1958|   692k|    u[3] = _mm256_srai_epi32(u[3], bit);
 1959|   692k|    y = _mm256_mullo_epi32(u[6], cospi32);
 1960|   692k|    x = _mm256_mullo_epi32(u[7], cospi32);
 1961|   692k|    u[6] = _mm256_add_epi32(y, x);
 1962|   692k|    u[6] = _mm256_add_epi32(u[6], rnding);
 1963|   692k|    u[6] = _mm256_srai_epi32(u[6], bit);
 1964|       |
 1965|   692k|    u[7] = _mm256_sub_epi32(y, x);
 1966|   692k|    u[7] = _mm256_add_epi32(u[7], rnding);
 1967|   692k|    u[7] = _mm256_srai_epi32(u[7], bit);
 1968|       |
 1969|   692k|    y = _mm256_mullo_epi32(u[10], cospi32);
 1970|   692k|    x = _mm256_mullo_epi32(u[11], cospi32);
 1971|   692k|    u[10] = _mm256_add_epi32(y, x);
 1972|   692k|    u[10] = _mm256_add_epi32(u[10], rnding);
 1973|   692k|    u[10] = _mm256_srai_epi32(u[10], bit);
 1974|       |
 1975|   692k|    u[11] = _mm256_sub_epi32(y, x);
 1976|   692k|    u[11] = _mm256_add_epi32(u[11], rnding);
 1977|   692k|    u[11] = _mm256_srai_epi32(u[11], bit);
 1978|       |
 1979|   692k|    y = _mm256_mullo_epi32(u[14], cospi32);
 1980|   692k|    x = _mm256_mullo_epi32(u[15], cospi32);
 1981|   692k|    u[14] = _mm256_add_epi32(y, x);
 1982|   692k|    u[14] = _mm256_add_epi32(u[14], rnding);
 1983|   692k|    u[14] = _mm256_srai_epi32(u[14], bit);
 1984|       |
 1985|   692k|    u[15] = _mm256_sub_epi32(y, x);
 1986|   692k|    u[15] = _mm256_add_epi32(u[15], rnding);
 1987|   692k|    u[15] = _mm256_srai_epi32(u[15], bit);
 1988|       |
 1989|       |    // stage 9
 1990|   692k|    if (do_cols) {
  ------------------
  |  Branch (1990:9): [True: 351k, False: 340k]
  ------------------
 1991|   351k|      out[0] = u[0];
 1992|   351k|      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
 1993|   351k|      out[2] = u[12];
 1994|   351k|      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
 1995|   351k|      out[4] = u[6];
 1996|   351k|      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
 1997|   351k|      out[6] = u[10];
 1998|   351k|      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
 1999|   351k|      out[8] = u[3];
 2000|   351k|      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
 2001|   351k|      out[10] = u[15];
 2002|   351k|      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
 2003|   351k|      out[12] = u[5];
 2004|   351k|      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
 2005|   351k|      out[14] = u[9];
 2006|   351k|      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
 2007|   351k|    } else {
 2008|   340k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   340k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 340k]
  |  |  ------------------
  ------------------
 2009|   340k|      const __m256i clamp_lo_out =
 2010|   340k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2011|   340k|      const __m256i clamp_hi_out =
 2012|   340k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2013|       |
 2014|   340k|      neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2015|   340k|                     out_shift);
 2016|   340k|      neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
 2017|   340k|                     &clamp_hi_out, out_shift);
 2018|   340k|      neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
 2019|   340k|                     &clamp_hi_out, out_shift);
 2020|   340k|      neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
 2021|   340k|                     &clamp_hi_out, out_shift);
 2022|   340k|      neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
 2023|   340k|                     &clamp_hi_out, out_shift);
 2024|   340k|      neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
 2025|   340k|                     &clamp_hi_out, out_shift);
 2026|   340k|      neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
 2027|   340k|                     &clamp_hi_out, out_shift);
 2028|   340k|      neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
 2029|   340k|                     &clamp_hi_out, out_shift);
 2030|   340k|    }
 2031|   692k|  }
 2032|   692k|}
highbd_inv_txfm_avx2.c:iadst16_avx2:
 2035|   296k|                         int bd, int out_shift) {
 2036|   296k|  const int32_t *cospi = cospi_arr(bit);
 2037|   296k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 2038|   296k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 2039|   296k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
 2040|   296k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
 2041|   296k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
 2042|   296k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
 2043|   296k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
 2044|   296k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
 2045|   296k|  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
 2046|   296k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
 2047|   296k|  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
 2048|   296k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
 2049|   296k|  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
 2050|   296k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
 2051|   296k|  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
 2052|   296k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 2053|   296k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 2054|   296k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 2055|   296k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 2056|   296k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 2057|   296k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 2058|   296k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 2059|   296k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 2060|   296k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 2061|   296k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 2062|   296k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2063|   296k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2064|   296k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   592k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 296k]
  |  |  |  Branch (35:31): [True: 113k, False: 183k]
  |  |  |  Branch (35:44): [True: 113k, False: 183k]
  |  |  ------------------
  ------------------
 2065|   296k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2066|   296k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2067|   296k|  __m256i u[16], v[16], x, y;
 2068|       |
 2069|   296k|  {
 2070|       |    // stage 0
 2071|       |    // stage 1
 2072|       |    // stage 2
 2073|   296k|    v[0] = _mm256_mullo_epi32(in[15], cospi2);
 2074|   296k|    x = _mm256_mullo_epi32(in[0], cospi62);
 2075|   296k|    v[0] = _mm256_add_epi32(v[0], x);
 2076|   296k|    v[0] = _mm256_add_epi32(v[0], rnding);
 2077|   296k|    v[0] = _mm256_srai_epi32(v[0], bit);
 2078|       |
 2079|   296k|    v[1] = _mm256_mullo_epi32(in[15], cospi62);
 2080|   296k|    x = _mm256_mullo_epi32(in[0], cospi2);
 2081|   296k|    v[1] = _mm256_sub_epi32(v[1], x);
 2082|   296k|    v[1] = _mm256_add_epi32(v[1], rnding);
 2083|   296k|    v[1] = _mm256_srai_epi32(v[1], bit);
 2084|       |
 2085|   296k|    v[2] = _mm256_mullo_epi32(in[13], cospi10);
 2086|   296k|    x = _mm256_mullo_epi32(in[2], cospi54);
 2087|   296k|    v[2] = _mm256_add_epi32(v[2], x);
 2088|   296k|    v[2] = _mm256_add_epi32(v[2], rnding);
 2089|   296k|    v[2] = _mm256_srai_epi32(v[2], bit);
 2090|       |
 2091|   296k|    v[3] = _mm256_mullo_epi32(in[13], cospi54);
 2092|   296k|    x = _mm256_mullo_epi32(in[2], cospi10);
 2093|   296k|    v[3] = _mm256_sub_epi32(v[3], x);
 2094|   296k|    v[3] = _mm256_add_epi32(v[3], rnding);
 2095|   296k|    v[3] = _mm256_srai_epi32(v[3], bit);
 2096|       |
 2097|   296k|    v[4] = _mm256_mullo_epi32(in[11], cospi18);
 2098|   296k|    x = _mm256_mullo_epi32(in[4], cospi46);
 2099|   296k|    v[4] = _mm256_add_epi32(v[4], x);
 2100|   296k|    v[4] = _mm256_add_epi32(v[4], rnding);
 2101|   296k|    v[4] = _mm256_srai_epi32(v[4], bit);
 2102|       |
 2103|   296k|    v[5] = _mm256_mullo_epi32(in[11], cospi46);
 2104|   296k|    x = _mm256_mullo_epi32(in[4], cospi18);
 2105|   296k|    v[5] = _mm256_sub_epi32(v[5], x);
 2106|   296k|    v[5] = _mm256_add_epi32(v[5], rnding);
 2107|   296k|    v[5] = _mm256_srai_epi32(v[5], bit);
 2108|       |
 2109|   296k|    v[6] = _mm256_mullo_epi32(in[9], cospi26);
 2110|   296k|    x = _mm256_mullo_epi32(in[6], cospi38);
 2111|   296k|    v[6] = _mm256_add_epi32(v[6], x);
 2112|   296k|    v[6] = _mm256_add_epi32(v[6], rnding);
 2113|   296k|    v[6] = _mm256_srai_epi32(v[6], bit);
 2114|       |
 2115|   296k|    v[7] = _mm256_mullo_epi32(in[9], cospi38);
 2116|   296k|    x = _mm256_mullo_epi32(in[6], cospi26);
 2117|   296k|    v[7] = _mm256_sub_epi32(v[7], x);
 2118|   296k|    v[7] = _mm256_add_epi32(v[7], rnding);
 2119|   296k|    v[7] = _mm256_srai_epi32(v[7], bit);
 2120|       |
 2121|   296k|    v[8] = _mm256_mullo_epi32(in[7], cospi34);
 2122|   296k|    x = _mm256_mullo_epi32(in[8], cospi30);
 2123|   296k|    v[8] = _mm256_add_epi32(v[8], x);
 2124|   296k|    v[8] = _mm256_add_epi32(v[8], rnding);
 2125|   296k|    v[8] = _mm256_srai_epi32(v[8], bit);
 2126|       |
 2127|   296k|    v[9] = _mm256_mullo_epi32(in[7], cospi30);
 2128|   296k|    x = _mm256_mullo_epi32(in[8], cospi34);
 2129|   296k|    v[9] = _mm256_sub_epi32(v[9], x);
 2130|   296k|    v[9] = _mm256_add_epi32(v[9], rnding);
 2131|   296k|    v[9] = _mm256_srai_epi32(v[9], bit);
 2132|       |
 2133|   296k|    v[10] = _mm256_mullo_epi32(in[5], cospi42);
 2134|   296k|    x = _mm256_mullo_epi32(in[10], cospi22);
 2135|   296k|    v[10] = _mm256_add_epi32(v[10], x);
 2136|   296k|    v[10] = _mm256_add_epi32(v[10], rnding);
 2137|   296k|    v[10] = _mm256_srai_epi32(v[10], bit);
 2138|       |
 2139|   296k|    v[11] = _mm256_mullo_epi32(in[5], cospi22);
 2140|   296k|    x = _mm256_mullo_epi32(in[10], cospi42);
 2141|   296k|    v[11] = _mm256_sub_epi32(v[11], x);
 2142|   296k|    v[11] = _mm256_add_epi32(v[11], rnding);
 2143|   296k|    v[11] = _mm256_srai_epi32(v[11], bit);
 2144|       |
 2145|   296k|    v[12] = _mm256_mullo_epi32(in[3], cospi50);
 2146|   296k|    x = _mm256_mullo_epi32(in[12], cospi14);
 2147|   296k|    v[12] = _mm256_add_epi32(v[12], x);
 2148|   296k|    v[12] = _mm256_add_epi32(v[12], rnding);
 2149|   296k|    v[12] = _mm256_srai_epi32(v[12], bit);
 2150|       |
 2151|   296k|    v[13] = _mm256_mullo_epi32(in[3], cospi14);
 2152|   296k|    x = _mm256_mullo_epi32(in[12], cospi50);
 2153|   296k|    v[13] = _mm256_sub_epi32(v[13], x);
 2154|   296k|    v[13] = _mm256_add_epi32(v[13], rnding);
 2155|   296k|    v[13] = _mm256_srai_epi32(v[13], bit);
 2156|       |
 2157|   296k|    v[14] = _mm256_mullo_epi32(in[1], cospi58);
 2158|   296k|    x = _mm256_mullo_epi32(in[14], cospi6);
 2159|   296k|    v[14] = _mm256_add_epi32(v[14], x);
 2160|   296k|    v[14] = _mm256_add_epi32(v[14], rnding);
 2161|   296k|    v[14] = _mm256_srai_epi32(v[14], bit);
 2162|       |
 2163|   296k|    v[15] = _mm256_mullo_epi32(in[1], cospi6);
 2164|   296k|    x = _mm256_mullo_epi32(in[14], cospi58);
 2165|   296k|    v[15] = _mm256_sub_epi32(v[15], x);
 2166|   296k|    v[15] = _mm256_add_epi32(v[15], rnding);
 2167|   296k|    v[15] = _mm256_srai_epi32(v[15], bit);
 2168|       |
 2169|       |    // stage 3
 2170|   296k|    addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
 2171|   296k|    addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
 2172|   296k|    addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
 2173|   296k|    addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
 2174|   296k|    addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
 2175|   296k|    addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
 2176|   296k|    addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
 2177|   296k|    addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 2178|       |
 2179|       |    // stage 4
 2180|   296k|    v[0] = u[0];
 2181|   296k|    v[1] = u[1];
 2182|   296k|    v[2] = u[2];
 2183|   296k|    v[3] = u[3];
 2184|   296k|    v[4] = u[4];
 2185|   296k|    v[5] = u[5];
 2186|   296k|    v[6] = u[6];
 2187|   296k|    v[7] = u[7];
 2188|       |
 2189|   296k|    v[8] = _mm256_mullo_epi32(u[8], cospi8);
 2190|   296k|    x = _mm256_mullo_epi32(u[9], cospi56);
 2191|   296k|    v[8] = _mm256_add_epi32(v[8], x);
 2192|   296k|    v[8] = _mm256_add_epi32(v[8], rnding);
 2193|   296k|    v[8] = _mm256_srai_epi32(v[8], bit);
 2194|       |
 2195|   296k|    v[9] = _mm256_mullo_epi32(u[8], cospi56);
 2196|   296k|    x = _mm256_mullo_epi32(u[9], cospi8);
 2197|   296k|    v[9] = _mm256_sub_epi32(v[9], x);
 2198|   296k|    v[9] = _mm256_add_epi32(v[9], rnding);
 2199|   296k|    v[9] = _mm256_srai_epi32(v[9], bit);
 2200|       |
 2201|   296k|    v[10] = _mm256_mullo_epi32(u[10], cospi40);
 2202|   296k|    x = _mm256_mullo_epi32(u[11], cospi24);
 2203|   296k|    v[10] = _mm256_add_epi32(v[10], x);
 2204|   296k|    v[10] = _mm256_add_epi32(v[10], rnding);
 2205|   296k|    v[10] = _mm256_srai_epi32(v[10], bit);
 2206|       |
 2207|   296k|    v[11] = _mm256_mullo_epi32(u[10], cospi24);
 2208|   296k|    x = _mm256_mullo_epi32(u[11], cospi40);
 2209|   296k|    v[11] = _mm256_sub_epi32(v[11], x);
 2210|   296k|    v[11] = _mm256_add_epi32(v[11], rnding);
 2211|   296k|    v[11] = _mm256_srai_epi32(v[11], bit);
 2212|       |
 2213|   296k|    v[12] = _mm256_mullo_epi32(u[12], cospim56);
 2214|   296k|    x = _mm256_mullo_epi32(u[13], cospi8);
 2215|   296k|    v[12] = _mm256_add_epi32(v[12], x);
 2216|   296k|    v[12] = _mm256_add_epi32(v[12], rnding);
 2217|   296k|    v[12] = _mm256_srai_epi32(v[12], bit);
 2218|       |
 2219|   296k|    v[13] = _mm256_mullo_epi32(u[12], cospi8);
 2220|   296k|    x = _mm256_mullo_epi32(u[13], cospim56);
 2221|   296k|    v[13] = _mm256_sub_epi32(v[13], x);
 2222|   296k|    v[13] = _mm256_add_epi32(v[13], rnding);
 2223|   296k|    v[13] = _mm256_srai_epi32(v[13], bit);
 2224|       |
 2225|   296k|    v[14] = _mm256_mullo_epi32(u[14], cospim24);
 2226|   296k|    x = _mm256_mullo_epi32(u[15], cospi40);
 2227|   296k|    v[14] = _mm256_add_epi32(v[14], x);
 2228|   296k|    v[14] = _mm256_add_epi32(v[14], rnding);
 2229|   296k|    v[14] = _mm256_srai_epi32(v[14], bit);
 2230|       |
 2231|   296k|    v[15] = _mm256_mullo_epi32(u[14], cospi40);
 2232|   296k|    x = _mm256_mullo_epi32(u[15], cospim24);
 2233|   296k|    v[15] = _mm256_sub_epi32(v[15], x);
 2234|   296k|    v[15] = _mm256_add_epi32(v[15], rnding);
 2235|   296k|    v[15] = _mm256_srai_epi32(v[15], bit);
 2236|       |
 2237|       |    // stage 5
 2238|   296k|    addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
 2239|   296k|    addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
 2240|   296k|    addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
 2241|   296k|    addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
 2242|   296k|    addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
 2243|   296k|    addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
 2244|   296k|    addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
 2245|   296k|    addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 2246|       |
 2247|       |    // stage 6
 2248|   296k|    v[0] = u[0];
 2249|   296k|    v[1] = u[1];
 2250|   296k|    v[2] = u[2];
 2251|   296k|    v[3] = u[3];
 2252|       |
 2253|   296k|    v[4] = _mm256_mullo_epi32(u[4], cospi16);
 2254|   296k|    x = _mm256_mullo_epi32(u[5], cospi48);
 2255|   296k|    v[4] = _mm256_add_epi32(v[4], x);
 2256|   296k|    v[4] = _mm256_add_epi32(v[4], rnding);
 2257|   296k|    v[4] = _mm256_srai_epi32(v[4], bit);
 2258|       |
 2259|   296k|    v[5] = _mm256_mullo_epi32(u[4], cospi48);
 2260|   296k|    x = _mm256_mullo_epi32(u[5], cospi16);
 2261|   296k|    v[5] = _mm256_sub_epi32(v[5], x);
 2262|   296k|    v[5] = _mm256_add_epi32(v[5], rnding);
 2263|   296k|    v[5] = _mm256_srai_epi32(v[5], bit);
 2264|       |
 2265|   296k|    v[6] = _mm256_mullo_epi32(u[6], cospim48);
 2266|   296k|    x = _mm256_mullo_epi32(u[7], cospi16);
 2267|   296k|    v[6] = _mm256_add_epi32(v[6], x);
 2268|   296k|    v[6] = _mm256_add_epi32(v[6], rnding);
 2269|   296k|    v[6] = _mm256_srai_epi32(v[6], bit);
 2270|       |
 2271|   296k|    v[7] = _mm256_mullo_epi32(u[6], cospi16);
 2272|   296k|    x = _mm256_mullo_epi32(u[7], cospim48);
 2273|   296k|    v[7] = _mm256_sub_epi32(v[7], x);
 2274|   296k|    v[7] = _mm256_add_epi32(v[7], rnding);
 2275|   296k|    v[7] = _mm256_srai_epi32(v[7], bit);
 2276|       |
 2277|   296k|    v[8] = u[8];
 2278|   296k|    v[9] = u[9];
 2279|   296k|    v[10] = u[10];
 2280|   296k|    v[11] = u[11];
 2281|       |
 2282|   296k|    v[12] = _mm256_mullo_epi32(u[12], cospi16);
 2283|   296k|    x = _mm256_mullo_epi32(u[13], cospi48);
 2284|   296k|    v[12] = _mm256_add_epi32(v[12], x);
 2285|   296k|    v[12] = _mm256_add_epi32(v[12], rnding);
 2286|   296k|    v[12] = _mm256_srai_epi32(v[12], bit);
 2287|       |
 2288|   296k|    v[13] = _mm256_mullo_epi32(u[12], cospi48);
 2289|   296k|    x = _mm256_mullo_epi32(u[13], cospi16);
 2290|   296k|    v[13] = _mm256_sub_epi32(v[13], x);
 2291|   296k|    v[13] = _mm256_add_epi32(v[13], rnding);
 2292|   296k|    v[13] = _mm256_srai_epi32(v[13], bit);
 2293|       |
 2294|   296k|    v[14] = _mm256_mullo_epi32(u[14], cospim48);
 2295|   296k|    x = _mm256_mullo_epi32(u[15], cospi16);
 2296|   296k|    v[14] = _mm256_add_epi32(v[14], x);
 2297|   296k|    v[14] = _mm256_add_epi32(v[14], rnding);
 2298|   296k|    v[14] = _mm256_srai_epi32(v[14], bit);
 2299|       |
 2300|   296k|    v[15] = _mm256_mullo_epi32(u[14], cospi16);
 2301|   296k|    x = _mm256_mullo_epi32(u[15], cospim48);
 2302|   296k|    v[15] = _mm256_sub_epi32(v[15], x);
 2303|   296k|    v[15] = _mm256_add_epi32(v[15], rnding);
 2304|   296k|    v[15] = _mm256_srai_epi32(v[15], bit);
 2305|       |
 2306|       |    // stage 7
 2307|   296k|    addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
 2308|   296k|    addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
 2309|   296k|    addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
 2310|   296k|    addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
 2311|   296k|    addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
 2312|   296k|    addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
 2313|   296k|    addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
 2314|   296k|    addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 2315|       |
 2316|       |    // stage 8
 2317|   296k|    v[0] = u[0];
 2318|   296k|    v[1] = u[1];
 2319|       |
 2320|   296k|    y = _mm256_mullo_epi32(u[2], cospi32);
 2321|   296k|    x = _mm256_mullo_epi32(u[3], cospi32);
 2322|   296k|    v[2] = _mm256_add_epi32(y, x);
 2323|   296k|    v[2] = _mm256_add_epi32(v[2], rnding);
 2324|   296k|    v[2] = _mm256_srai_epi32(v[2], bit);
 2325|       |
 2326|   296k|    v[3] = _mm256_sub_epi32(y, x);
 2327|   296k|    v[3] = _mm256_add_epi32(v[3], rnding);
 2328|   296k|    v[3] = _mm256_srai_epi32(v[3], bit);
 2329|       |
 2330|   296k|    v[4] = u[4];
 2331|   296k|    v[5] = u[5];
 2332|       |
 2333|   296k|    y = _mm256_mullo_epi32(u[6], cospi32);
 2334|   296k|    x = _mm256_mullo_epi32(u[7], cospi32);
 2335|   296k|    v[6] = _mm256_add_epi32(y, x);
 2336|   296k|    v[6] = _mm256_add_epi32(v[6], rnding);
 2337|   296k|    v[6] = _mm256_srai_epi32(v[6], bit);
 2338|       |
 2339|   296k|    v[7] = _mm256_sub_epi32(y, x);
 2340|   296k|    v[7] = _mm256_add_epi32(v[7], rnding);
 2341|   296k|    v[7] = _mm256_srai_epi32(v[7], bit);
 2342|       |
 2343|   296k|    v[8] = u[8];
 2344|   296k|    v[9] = u[9];
 2345|       |
 2346|   296k|    y = _mm256_mullo_epi32(u[10], cospi32);
 2347|   296k|    x = _mm256_mullo_epi32(u[11], cospi32);
 2348|   296k|    v[10] = _mm256_add_epi32(y, x);
 2349|   296k|    v[10] = _mm256_add_epi32(v[10], rnding);
 2350|   296k|    v[10] = _mm256_srai_epi32(v[10], bit);
 2351|       |
 2352|   296k|    v[11] = _mm256_sub_epi32(y, x);
 2353|   296k|    v[11] = _mm256_add_epi32(v[11], rnding);
 2354|   296k|    v[11] = _mm256_srai_epi32(v[11], bit);
 2355|       |
 2356|   296k|    v[12] = u[12];
 2357|   296k|    v[13] = u[13];
 2358|       |
 2359|   296k|    y = _mm256_mullo_epi32(u[14], cospi32);
 2360|   296k|    x = _mm256_mullo_epi32(u[15], cospi32);
 2361|   296k|    v[14] = _mm256_add_epi32(y, x);
 2362|   296k|    v[14] = _mm256_add_epi32(v[14], rnding);
 2363|   296k|    v[14] = _mm256_srai_epi32(v[14], bit);
 2364|       |
 2365|   296k|    v[15] = _mm256_sub_epi32(y, x);
 2366|   296k|    v[15] = _mm256_add_epi32(v[15], rnding);
 2367|   296k|    v[15] = _mm256_srai_epi32(v[15], bit);
 2368|       |
 2369|       |    // stage 9
 2370|   296k|    if (do_cols) {
  ------------------
  |  Branch (2370:9): [True: 113k, False: 183k]
  ------------------
 2371|   113k|      out[0] = v[0];
 2372|   113k|      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
 2373|   113k|      out[2] = v[12];
 2374|   113k|      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
 2375|   113k|      out[4] = v[6];
 2376|   113k|      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
 2377|   113k|      out[6] = v[10];
 2378|   113k|      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
 2379|   113k|      out[8] = v[3];
 2380|   113k|      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
 2381|   113k|      out[10] = v[15];
 2382|   113k|      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
 2383|   113k|      out[12] = v[5];
 2384|   113k|      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
 2385|   113k|      out[14] = v[9];
 2386|   113k|      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
 2387|   183k|    } else {
 2388|   183k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   183k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 183k]
  |  |  ------------------
  ------------------
 2389|   183k|      const __m256i clamp_lo_out =
 2390|   183k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2391|   183k|      const __m256i clamp_hi_out =
 2392|   183k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2393|       |
 2394|   183k|      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2395|   183k|                     out_shift);
 2396|   183k|      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
 2397|   183k|                     &clamp_hi_out, out_shift);
 2398|   183k|      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
 2399|   183k|                     &clamp_hi_out, out_shift);
 2400|   183k|      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
 2401|   183k|                     &clamp_hi_out, out_shift);
 2402|   183k|      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
 2403|   183k|                     &clamp_hi_out, out_shift);
 2404|   183k|      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
 2405|   183k|                     &clamp_hi_out, out_shift);
 2406|   183k|      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
 2407|   183k|                     &clamp_hi_out, out_shift);
 2408|   183k|      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
 2409|   183k|                     &clamp_hi_out, out_shift);
 2410|   183k|    }
 2411|   296k|  }
 2412|   296k|}
highbd_inv_txfm_avx2.c:idct32_low1_avx2:
  443|  1.15M|                             int bd, int out_shift) {
  444|  1.15M|  const int32_t *cospi = cospi_arr(bit);
  445|  1.15M|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  446|  1.15M|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  447|  1.15M|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  2.31M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.15M]
  |  |  |  Branch (35:31): [True: 780k, False: 378k]
  |  |  |  Branch (35:44): [True: 780k, False: 378k]
  |  |  ------------------
  ------------------
  448|  1.15M|  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  449|  1.15M|  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  450|  1.15M|  __m256i x;
  451|       |  // stage 0
  452|       |  // stage 1
  453|       |  // stage 2
  454|       |  // stage 3
  455|       |  // stage 4
  456|       |  // stage 5
  457|  1.15M|  x = _mm256_mullo_epi32(in[0], cospi32);
  458|  1.15M|  x = _mm256_add_epi32(x, rounding);
  459|  1.15M|  x = _mm256_srai_epi32(x, bit);
  460|       |
  461|       |  // stage 6
  462|       |  // stage 7
  463|       |  // stage 8
  464|       |  // stage 9
  465|  1.15M|  if (!do_cols) {
  ------------------
  |  Branch (465:7): [True: 379k, False: 780k]
  ------------------
  466|   379k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   379k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 379k]
  |  |  ------------------
  ------------------
  467|   379k|    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
  468|   379k|    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
  469|   379k|    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
  470|   379k|    x = _mm256_add_epi32(offset, x);
  471|   379k|    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  472|   379k|  }
  473|  1.15M|  x = _mm256_max_epi32(x, clamp_lo);
  474|  1.15M|  x = _mm256_min_epi32(x, clamp_hi);
  475|  1.15M|  out[0] = x;
  476|  1.15M|  out[1] = x;
  477|  1.15M|  out[2] = x;
  478|  1.15M|  out[3] = x;
  479|  1.15M|  out[4] = x;
  480|  1.15M|  out[5] = x;
  481|  1.15M|  out[6] = x;
  482|  1.15M|  out[7] = x;
  483|  1.15M|  out[8] = x;
  484|  1.15M|  out[9] = x;
  485|  1.15M|  out[10] = x;
  486|  1.15M|  out[11] = x;
  487|  1.15M|  out[12] = x;
  488|  1.15M|  out[13] = x;
  489|  1.15M|  out[14] = x;
  490|  1.15M|  out[15] = x;
  491|  1.15M|  out[16] = x;
  492|  1.15M|  out[17] = x;
  493|  1.15M|  out[18] = x;
  494|  1.15M|  out[19] = x;
  495|  1.15M|  out[20] = x;
  496|  1.15M|  out[21] = x;
  497|  1.15M|  out[22] = x;
  498|  1.15M|  out[23] = x;
  499|  1.15M|  out[24] = x;
  500|  1.15M|  out[25] = x;
  501|  1.15M|  out[26] = x;
  502|  1.15M|  out[27] = x;
  503|  1.15M|  out[28] = x;
  504|  1.15M|  out[29] = x;
  505|  1.15M|  out[30] = x;
  506|  1.15M|  out[31] = x;
  507|  1.15M|}
highbd_inv_txfm_avx2.c:idct32_low8_avx2:
  510|  1.41M|                             int bd, int out_shift) {
  511|  1.41M|  const int32_t *cospi = cospi_arr(bit);
  512|  1.41M|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  513|  1.41M|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  514|  1.41M|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  515|  1.41M|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  516|  1.41M|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  517|  1.41M|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  518|  1.41M|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  519|  1.41M|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  520|  1.41M|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  521|  1.41M|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  522|  1.41M|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  523|  1.41M|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  524|  1.41M|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  525|  1.41M|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  526|  1.41M|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  527|  1.41M|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  528|  1.41M|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  529|  1.41M|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  530|  1.41M|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  531|  1.41M|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  532|  1.41M|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  533|  1.41M|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  534|  1.41M|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  535|  1.41M|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  536|  1.41M|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  537|  1.41M|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  538|  1.41M|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  539|  1.41M|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  2.83M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.41M]
  |  |  |  Branch (35:31): [True: 899k, False: 516k]
  |  |  |  Branch (35:44): [True: 899k, False: 516k]
  |  |  ------------------
  ------------------
  540|  1.41M|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  541|  1.41M|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  542|  1.41M|  __m256i bf1[32];
  543|       |
  544|  1.41M|  {
  545|       |    // stage 0
  546|       |    // stage 1
  547|  1.41M|    bf1[0] = in[0];
  548|  1.41M|    bf1[4] = in[4];
  549|  1.41M|    bf1[8] = in[2];
  550|  1.41M|    bf1[12] = in[6];
  551|  1.41M|    bf1[16] = in[1];
  552|  1.41M|    bf1[20] = in[5];
  553|  1.41M|    bf1[24] = in[3];
  554|  1.41M|    bf1[28] = in[7];
  555|       |
  556|       |    // stage 2
  557|  1.41M|    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
  558|  1.41M|    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
  559|  1.41M|    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
  560|  1.41M|    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
  561|  1.41M|    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
  562|  1.41M|    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
  563|  1.41M|    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
  564|  1.41M|    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
  565|       |
  566|       |    // stage 3
  567|  1.41M|    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
  568|  1.41M|    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
  569|       |
  570|  1.41M|    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
  571|  1.41M|    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
  572|  1.41M|    bf1[17] = bf1[16];
  573|  1.41M|    bf1[18] = bf1[19];
  574|  1.41M|    bf1[21] = bf1[20];
  575|  1.41M|    bf1[22] = bf1[23];
  576|  1.41M|    bf1[25] = bf1[24];
  577|  1.41M|    bf1[26] = bf1[27];
  578|  1.41M|    bf1[29] = bf1[28];
  579|  1.41M|    bf1[30] = bf1[31];
  580|       |
  581|       |    // stage 4
  582|  1.41M|    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
  583|  1.41M|    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
  584|       |
  585|  1.41M|    bf1[9] = bf1[8];
  586|  1.41M|    bf1[10] = bf1[11];
  587|  1.41M|    bf1[13] = bf1[12];
  588|  1.41M|    bf1[14] = bf1[15];
  589|       |
  590|  1.41M|    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
  591|  1.41M|                       &cospi24, &cospi40, &cospim24, &rounding, bit);
  592|       |
  593|       |    // stage 5
  594|  1.41M|    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
  595|  1.41M|    bf1[1] = bf1[0];
  596|  1.41M|    bf1[5] = bf1[4];
  597|  1.41M|    bf1[6] = bf1[7];
  598|       |
  599|  1.41M|    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
  600|  1.41M|                       &clamp_hi, &rounding, bit);
  601|       |
  602|       |    // stage 6
  603|  1.41M|    bf1[3] = bf1[0];
  604|  1.41M|    bf1[2] = bf1[1];
  605|       |
  606|  1.41M|    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
  607|  1.41M|                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
  608|       |
  609|       |    // stage 7
  610|  1.41M|    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  611|  1.41M|                       &rounding, bit);
  612|       |
  613|       |    // stage 8
  614|  1.41M|    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  615|  1.41M|                       &rounding, bit);
  616|       |
  617|       |    // stage 9
  618|  1.41M|    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  619|  1.41M|  }
  620|  1.41M|}
highbd_inv_txfm_avx2.c:idct32_stage4_avx2:
  282|  1.84M|    const __m256i *rounding, int bit) {
  283|  1.84M|  __m256i temp1, temp2;
  284|  1.84M|  temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  285|  1.84M|  bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  286|  1.84M|  bf1[17] = temp1;
  287|       |
  288|  1.84M|  temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  289|  1.84M|  bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  290|  1.84M|  bf1[18] = temp2;
  291|       |
  292|  1.84M|  temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  293|  1.84M|  bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  294|  1.84M|  bf1[21] = temp1;
  295|       |
  296|  1.84M|  temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  297|  1.84M|  bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  298|  1.84M|  bf1[22] = temp2;
  299|  1.84M|}
highbd_inv_txfm_avx2.c:idct32_stage5_avx2:
  304|  1.84M|    const __m256i *clamp_hi, const __m256i *rounding, int bit) {
  305|  1.84M|  __m256i temp1, temp2;
  306|  1.84M|  temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  307|  1.84M|  bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  308|  1.84M|  bf1[9] = temp1;
  309|       |
  310|  1.84M|  temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  311|  1.84M|  bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  312|  1.84M|  bf1[10] = temp2;
  313|       |
  314|  1.84M|  addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  315|  1.84M|  addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  316|  1.84M|  addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  317|  1.84M|  addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  318|  1.84M|  addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  319|  1.84M|  addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  320|  1.84M|  addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  321|  1.84M|  addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
  322|  1.84M|}
highbd_inv_txfm_avx2.c:idct32_stage6_avx2:
  328|  1.84M|    const __m256i *rounding, int bit) {
  329|  1.84M|  __m256i temp1, temp2;
  330|  1.84M|  temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  331|  1.84M|  bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  332|  1.84M|  bf1[5] = temp1;
  333|       |
  334|  1.84M|  addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  335|  1.84M|  addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  336|  1.84M|  addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  337|  1.84M|  addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
  338|       |
  339|  1.84M|  temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  340|  1.84M|  bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  341|  1.84M|  bf1[18] = temp1;
  342|  1.84M|  temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  343|  1.84M|  bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  344|  1.84M|  bf1[19] = temp2;
  345|  1.84M|  temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  346|  1.84M|  bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  347|  1.84M|  bf1[20] = temp1;
  348|  1.84M|  temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  349|  1.84M|  bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  350|  1.84M|  bf1[21] = temp2;
  351|  1.84M|}
highbd_inv_txfm_avx2.c:idct32_stage7_avx2:
  357|  1.84M|                                      const __m256i *rounding, int bit) {
  358|  1.84M|  __m256i temp1, temp2;
  359|  1.84M|  addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  360|  1.84M|  addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  361|  1.84M|  addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  362|  1.84M|  addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
  363|       |
  364|  1.84M|  temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  365|  1.84M|  bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  366|  1.84M|  bf1[10] = temp1;
  367|  1.84M|  temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  368|  1.84M|  bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  369|  1.84M|  bf1[11] = temp2;
  370|       |
  371|  1.84M|  addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  372|  1.84M|  addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  373|  1.84M|  addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  374|  1.84M|  addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  375|  1.84M|  addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  376|  1.84M|  addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  377|  1.84M|  addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  378|  1.84M|  addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
  379|  1.84M|}
highbd_inv_txfm_avx2.c:idct32_stage8_avx2:
  385|  1.84M|                                      const __m256i *rounding, int bit) {
  386|  1.84M|  __m256i temp1, temp2;
  387|  1.84M|  addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  388|  1.84M|  addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  389|  1.84M|  addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  390|  1.84M|  addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  391|  1.84M|  addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  392|  1.84M|  addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  393|  1.84M|  addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  394|  1.84M|  addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
  395|       |
  396|  1.84M|  temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  397|  1.84M|  bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  398|  1.84M|  bf1[20] = temp1;
  399|  1.84M|  temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  400|  1.84M|  bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  401|  1.84M|  bf1[21] = temp2;
  402|  1.84M|  temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  403|  1.84M|  bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  404|  1.84M|  bf1[22] = temp1;
  405|  1.84M|  temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  406|  1.84M|  bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  407|  1.84M|  bf1[23] = temp2;
  408|  1.84M|}
highbd_inv_txfm_avx2.c:idct32_stage9_avx2:
  414|  1.84M|                                      const __m256i *clamp_hi) {
  415|  1.84M|  addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
  416|  1.84M|  addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
  417|  1.84M|  addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
  418|  1.84M|  addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
  419|  1.84M|  addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
  420|  1.84M|  addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
  421|  1.84M|  addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
  422|  1.84M|  addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
  423|  1.84M|  addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
  424|  1.84M|  addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
  425|  1.84M|  addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
  426|  1.84M|  addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
  427|  1.84M|  addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
  428|  1.84M|  addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
  429|  1.84M|  addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
  430|  1.84M|  addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
  431|  1.84M|  if (!do_cols) {
  ------------------
  |  Branch (431:7): [True: 725k, False: 1.11M]
  ------------------
  432|   725k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   725k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 725k]
  |  |  ------------------
  ------------------
  433|   725k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
  434|   725k|    const __m256i clamp_hi_out =
  435|   725k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
  436|   725k|    round_shift_8x8_avx2(out, out_shift);
  437|   725k|    round_shift_8x8_avx2(out + 16, out_shift);
  438|   725k|    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  439|   725k|  }
  440|  1.84M|}
highbd_inv_txfm_avx2.c:idct32_low16_avx2:
  623|   428k|                              int bd, int out_shift) {
  624|   428k|  const int32_t *cospi = cospi_arr(bit);
  625|   428k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  626|   428k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  627|   428k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  628|   428k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  629|   428k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  630|   428k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  631|   428k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  632|   428k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  633|   428k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  634|   428k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  635|   428k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  636|   428k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  637|   428k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  638|   428k|  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  639|   428k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  640|   428k|  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  641|   428k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  642|   428k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  643|   428k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  644|   428k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  645|   428k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  646|   428k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  647|   428k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  648|   428k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  649|   428k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  650|   428k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  651|   428k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  652|   428k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  653|   428k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  654|   428k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  655|   428k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  656|   428k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  657|   428k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  658|   428k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  659|   428k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  660|   428k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  661|   428k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  662|   428k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  663|   428k|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  664|   428k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   857k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 428k]
  |  |  |  Branch (35:31): [True: 220k, False: 208k]
  |  |  |  Branch (35:44): [True: 220k, False: 208k]
  |  |  ------------------
  ------------------
  665|   428k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  666|   428k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  667|   428k|  __m256i bf1[32];
  668|       |
  669|   428k|  {
  670|       |    // stage 0
  671|       |    // stage 1
  672|   428k|    bf1[0] = in[0];
  673|   428k|    bf1[2] = in[8];
  674|   428k|    bf1[4] = in[4];
  675|   428k|    bf1[6] = in[12];
  676|   428k|    bf1[8] = in[2];
  677|   428k|    bf1[10] = in[10];
  678|   428k|    bf1[12] = in[6];
  679|   428k|    bf1[14] = in[14];
  680|   428k|    bf1[16] = in[1];
  681|   428k|    bf1[18] = in[9];
  682|   428k|    bf1[20] = in[5];
  683|   428k|    bf1[22] = in[13];
  684|   428k|    bf1[24] = in[3];
  685|   428k|    bf1[26] = in[11];
  686|   428k|    bf1[28] = in[7];
  687|   428k|    bf1[30] = in[15];
  688|       |
  689|       |    // stage 2
  690|   428k|    bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
  691|   428k|    bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
  692|   428k|    bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
  693|   428k|    bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
  694|   428k|    bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
  695|   428k|    bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
  696|   428k|    bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
  697|   428k|    bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
  698|   428k|    bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
  699|   428k|    bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
  700|   428k|    bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
  701|   428k|    bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
  702|   428k|    bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
  703|   428k|    bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
  704|   428k|    bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
  705|   428k|    bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
  706|       |
  707|       |    // stage 3
  708|   428k|    bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
  709|   428k|    bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
  710|   428k|    bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
  711|   428k|    bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
  712|   428k|    bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
  713|   428k|    bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
  714|   428k|    bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
  715|   428k|    bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
  716|       |
  717|   428k|    addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  718|   428k|    addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  719|   428k|    addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  720|   428k|    addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  721|   428k|    addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  722|   428k|    addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  723|   428k|    addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  724|   428k|    addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
  725|       |
  726|       |    // stage 4
  727|   428k|    bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
  728|   428k|    bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
  729|   428k|    bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
  730|   428k|    bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
  731|       |
  732|   428k|    addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
  733|   428k|    addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
  734|   428k|    addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
  735|   428k|    addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
  736|       |
  737|   428k|    idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
  738|   428k|                       &cospi24, &cospi40, &cospim24, &rounding, bit);
  739|       |
  740|       |    // stage 5
  741|   428k|    bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
  742|   428k|    bf1[1] = bf1[0];
  743|   428k|    bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
  744|   428k|    bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
  745|       |
  746|   428k|    addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  747|   428k|    addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
  748|       |
  749|   428k|    idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
  750|   428k|                       &clamp_hi, &rounding, bit);
  751|       |
  752|       |    // stage 6
  753|   428k|    addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
  754|   428k|    addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
  755|       |
  756|   428k|    idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
  757|   428k|                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
  758|       |
  759|       |    // stage 7
  760|   428k|    idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  761|   428k|                       &rounding, bit);
  762|       |
  763|       |    // stage 8
  764|   428k|    idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
  765|   428k|                       &rounding, bit);
  766|       |
  767|       |    // stage 9
  768|   428k|    idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  769|   428k|  }
  770|   428k|}
highbd_inv_txfm_avx2.c:idct32_avx2:
  773|   335k|                        int out_shift) {
  774|   335k|  const int32_t *cospi = cospi_arr(bit);
  775|   335k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
  776|   335k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
  777|   335k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
  778|   335k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  779|   335k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
  780|   335k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
  781|   335k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
  782|   335k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  783|   335k|  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
  784|   335k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
  785|   335k|  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
  786|   335k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  787|   335k|  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
  788|   335k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
  789|   335k|  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
  790|   335k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  791|   335k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  792|   335k|  const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
  793|   335k|  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
  794|   335k|  const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
  795|   335k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
  796|   335k|  const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
  797|   335k|  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
  798|   335k|  const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
  799|   335k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  800|   335k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  801|   335k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  802|   335k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  803|   335k|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  804|   335k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  805|   335k|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  806|   335k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  807|   335k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  808|   335k|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  809|   335k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  810|   335k|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  811|   335k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  812|   335k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  813|   335k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  814|   335k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  815|   335k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  816|   335k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  817|   335k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  818|   335k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  819|   335k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  820|   335k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  821|   335k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  822|   335k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  823|   335k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  824|   335k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  825|   335k|  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  826|   335k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   671k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 335k]
  |  |  |  Branch (35:31): [True: 166k, False: 169k]
  |  |  |  Branch (35:44): [True: 166k, False: 169k]
  |  |  ------------------
  ------------------
  827|   335k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  828|   335k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  829|   335k|  __m256i bf1[32], bf0[32];
  830|       |
  831|   335k|  {
  832|       |    // stage 0
  833|       |    // stage 1
  834|   335k|    bf1[0] = in[0];
  835|   335k|    bf1[1] = in[16];
  836|   335k|    bf1[2] = in[8];
  837|   335k|    bf1[3] = in[24];
  838|   335k|    bf1[4] = in[4];
  839|   335k|    bf1[5] = in[20];
  840|   335k|    bf1[6] = in[12];
  841|   335k|    bf1[7] = in[28];
  842|   335k|    bf1[8] = in[2];
  843|   335k|    bf1[9] = in[18];
  844|   335k|    bf1[10] = in[10];
  845|   335k|    bf1[11] = in[26];
  846|   335k|    bf1[12] = in[6];
  847|   335k|    bf1[13] = in[22];
  848|   335k|    bf1[14] = in[14];
  849|   335k|    bf1[15] = in[30];
  850|   335k|    bf1[16] = in[1];
  851|   335k|    bf1[17] = in[17];
  852|   335k|    bf1[18] = in[9];
  853|   335k|    bf1[19] = in[25];
  854|   335k|    bf1[20] = in[5];
  855|   335k|    bf1[21] = in[21];
  856|   335k|    bf1[22] = in[13];
  857|   335k|    bf1[23] = in[29];
  858|   335k|    bf1[24] = in[3];
  859|   335k|    bf1[25] = in[19];
  860|   335k|    bf1[26] = in[11];
  861|   335k|    bf1[27] = in[27];
  862|   335k|    bf1[28] = in[7];
  863|   335k|    bf1[29] = in[23];
  864|   335k|    bf1[30] = in[15];
  865|   335k|    bf1[31] = in[31];
  866|       |
  867|       |    // stage 2
  868|   335k|    bf0[0] = bf1[0];
  869|   335k|    bf0[1] = bf1[1];
  870|   335k|    bf0[2] = bf1[2];
  871|   335k|    bf0[3] = bf1[3];
  872|   335k|    bf0[4] = bf1[4];
  873|   335k|    bf0[5] = bf1[5];
  874|   335k|    bf0[6] = bf1[6];
  875|   335k|    bf0[7] = bf1[7];
  876|   335k|    bf0[8] = bf1[8];
  877|   335k|    bf0[9] = bf1[9];
  878|   335k|    bf0[10] = bf1[10];
  879|   335k|    bf0[11] = bf1[11];
  880|   335k|    bf0[12] = bf1[12];
  881|   335k|    bf0[13] = bf1[13];
  882|   335k|    bf0[14] = bf1[14];
  883|   335k|    bf0[15] = bf1[15];
  884|   335k|    bf0[16] =
  885|   335k|        half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
  886|   335k|    bf0[17] =
  887|   335k|        half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
  888|   335k|    bf0[18] =
  889|   335k|        half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
  890|   335k|    bf0[19] =
  891|   335k|        half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
  892|   335k|    bf0[20] =
  893|   335k|        half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
  894|   335k|    bf0[21] =
  895|   335k|        half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
  896|   335k|    bf0[22] =
  897|   335k|        half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
  898|   335k|    bf0[23] =
  899|   335k|        half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
  900|   335k|    bf0[24] =
  901|   335k|        half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
  902|   335k|    bf0[25] =
  903|   335k|        half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
  904|   335k|    bf0[26] =
  905|   335k|        half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
  906|   335k|    bf0[27] =
  907|   335k|        half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
  908|   335k|    bf0[28] =
  909|   335k|        half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
  910|   335k|    bf0[29] =
  911|   335k|        half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
  912|   335k|    bf0[30] =
  913|   335k|        half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
  914|   335k|    bf0[31] =
  915|   335k|        half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
  916|       |
  917|       |    // stage 3
  918|   335k|    bf1[0] = bf0[0];
  919|   335k|    bf1[1] = bf0[1];
  920|   335k|    bf1[2] = bf0[2];
  921|   335k|    bf1[3] = bf0[3];
  922|   335k|    bf1[4] = bf0[4];
  923|   335k|    bf1[5] = bf0[5];
  924|   335k|    bf1[6] = bf0[6];
  925|   335k|    bf1[7] = bf0[7];
  926|   335k|    bf1[8] =
  927|   335k|        half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
  928|   335k|    bf1[9] =
  929|   335k|        half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
  930|   335k|    bf1[10] =
  931|   335k|        half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
  932|   335k|    bf1[11] =
  933|   335k|        half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
  934|   335k|    bf1[12] =
  935|   335k|        half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
  936|   335k|    bf1[13] =
  937|   335k|        half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
  938|   335k|    bf1[14] =
  939|   335k|        half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
  940|   335k|    bf1[15] =
  941|   335k|        half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
  942|       |
  943|   335k|    addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  944|   335k|    addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  945|   335k|    addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  946|   335k|    addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  947|   335k|    addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  948|   335k|    addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  949|   335k|    addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  950|   335k|    addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
  951|       |
  952|       |    // stage 4
  953|   335k|    bf0[0] = bf1[0];
  954|   335k|    bf0[1] = bf1[1];
  955|   335k|    bf0[2] = bf1[2];
  956|   335k|    bf0[3] = bf1[3];
  957|   335k|    bf0[4] =
  958|   335k|        half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
  959|   335k|    bf0[5] =
  960|   335k|        half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
  961|   335k|    bf0[6] =
  962|   335k|        half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
  963|   335k|    bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
  964|       |
  965|   335k|    addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
  966|   335k|    addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
  967|   335k|    addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
  968|   335k|    addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
  969|       |
  970|   335k|    bf0[16] = bf1[16];
  971|   335k|    bf0[17] =
  972|   335k|        half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
  973|   335k|    bf0[18] =
  974|   335k|        half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
  975|   335k|    bf0[19] = bf1[19];
  976|   335k|    bf0[20] = bf1[20];
  977|   335k|    bf0[21] =
  978|   335k|        half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
  979|   335k|    bf0[22] =
  980|   335k|        half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
  981|   335k|    bf0[23] = bf1[23];
  982|   335k|    bf0[24] = bf1[24];
  983|   335k|    bf0[25] =
  984|   335k|        half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
  985|   335k|    bf0[26] =
  986|   335k|        half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
  987|   335k|    bf0[27] = bf1[27];
  988|   335k|    bf0[28] = bf1[28];
  989|   335k|    bf0[29] =
  990|   335k|        half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
  991|   335k|    bf0[30] =
  992|   335k|        half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
  993|   335k|    bf0[31] = bf1[31];
  994|       |
  995|       |    // stage 5
  996|   335k|    bf1[0] =
  997|   335k|        half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
  998|   335k|    bf1[1] =
  999|   335k|        half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
 1000|   335k|    bf1[2] =
 1001|   335k|        half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
 1002|   335k|    bf1[3] =
 1003|   335k|        half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
 1004|   335k|    addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
 1005|   335k|    addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
 1006|   335k|    bf1[8] = bf0[8];
 1007|   335k|    bf1[9] =
 1008|   335k|        half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
 1009|   335k|    bf1[10] =
 1010|   335k|        half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
 1011|   335k|    bf1[11] = bf0[11];
 1012|   335k|    bf1[12] = bf0[12];
 1013|   335k|    bf1[13] =
 1014|   335k|        half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
 1015|   335k|    bf1[14] =
 1016|   335k|        half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
 1017|   335k|    bf1[15] = bf0[15];
 1018|   335k|    addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
 1019|   335k|    addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
 1020|   335k|    addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
 1021|   335k|    addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
 1022|   335k|    addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
 1023|   335k|    addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
 1024|   335k|    addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
 1025|   335k|    addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
 1026|       |
 1027|       |    // stage 6
 1028|   335k|    addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
 1029|   335k|    addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
 1030|   335k|    bf0[4] = bf1[4];
 1031|   335k|    bf0[5] =
 1032|   335k|        half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
 1033|   335k|    bf0[6] =
 1034|   335k|        half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
 1035|   335k|    bf0[7] = bf1[7];
 1036|   335k|    addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
 1037|   335k|    addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
 1038|   335k|    addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
 1039|   335k|    addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
 1040|   335k|    bf0[16] = bf1[16];
 1041|   335k|    bf0[17] = bf1[17];
 1042|   335k|    bf0[18] =
 1043|   335k|        half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
 1044|   335k|    bf0[19] =
 1045|   335k|        half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
 1046|   335k|    bf0[20] =
 1047|   335k|        half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
 1048|   335k|    bf0[21] =
 1049|   335k|        half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
 1050|   335k|    bf0[22] = bf1[22];
 1051|   335k|    bf0[23] = bf1[23];
 1052|   335k|    bf0[24] = bf1[24];
 1053|   335k|    bf0[25] = bf1[25];
 1054|   335k|    bf0[26] =
 1055|   335k|        half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
 1056|   335k|    bf0[27] =
 1057|   335k|        half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
 1058|   335k|    bf0[28] =
 1059|   335k|        half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
 1060|   335k|    bf0[29] =
 1061|   335k|        half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
 1062|   335k|    bf0[30] = bf1[30];
 1063|   335k|    bf0[31] = bf1[31];
 1064|       |
 1065|       |    // stage 7
 1066|   335k|    addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
 1067|   335k|    addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
 1068|   335k|    addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
 1069|   335k|    addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
 1070|   335k|    bf1[8] = bf0[8];
 1071|   335k|    bf1[9] = bf0[9];
 1072|   335k|    bf1[10] =
 1073|   335k|        half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
 1074|   335k|    bf1[11] =
 1075|   335k|        half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
 1076|   335k|    bf1[12] =
 1077|   335k|        half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
 1078|   335k|    bf1[13] =
 1079|   335k|        half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
 1080|   335k|    bf1[14] = bf0[14];
 1081|   335k|    bf1[15] = bf0[15];
 1082|   335k|    addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
 1083|   335k|    addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
 1084|   335k|    addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
 1085|   335k|    addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
 1086|   335k|    addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
 1087|   335k|    addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
 1088|   335k|    addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
 1089|   335k|    addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
 1090|       |
 1091|       |    // stage 8
 1092|   335k|    addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
 1093|   335k|    addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
 1094|   335k|    addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
 1095|   335k|    addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
 1096|   335k|    addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
 1097|   335k|    addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
 1098|   335k|    addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
 1099|   335k|    addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
 1100|   335k|    bf0[16] = bf1[16];
 1101|   335k|    bf0[17] = bf1[17];
 1102|   335k|    bf0[18] = bf1[18];
 1103|   335k|    bf0[19] = bf1[19];
 1104|   335k|    bf0[20] =
 1105|   335k|        half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
 1106|   335k|    bf0[21] =
 1107|   335k|        half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
 1108|   335k|    bf0[22] =
 1109|   335k|        half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
 1110|   335k|    bf0[23] =
 1111|   335k|        half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
 1112|   335k|    bf0[24] =
 1113|   335k|        half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
 1114|   335k|    bf0[25] =
 1115|   335k|        half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
 1116|   335k|    bf0[26] =
 1117|   335k|        half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
 1118|   335k|    bf0[27] =
 1119|   335k|        half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
 1120|   335k|    bf0[28] = bf1[28];
 1121|   335k|    bf0[29] = bf1[29];
 1122|   335k|    bf0[30] = bf1[30];
 1123|   335k|    bf0[31] = bf1[31];
 1124|       |
 1125|       |    // stage 9
 1126|   335k|    addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
 1127|   335k|    addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
 1128|   335k|    addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
 1129|   335k|    addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
 1130|   335k|    addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
 1131|   335k|    addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
 1132|   335k|    addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
 1133|   335k|    addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
 1134|   335k|    addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
 1135|   335k|    addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
 1136|   335k|    addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
 1137|   335k|    addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
 1138|   335k|    addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
 1139|   335k|    addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
 1140|   335k|    addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
 1141|   335k|    addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
 1142|   335k|    if (!do_cols) {
  ------------------
  |  Branch (1142:9): [True: 169k, False: 166k]
  ------------------
 1143|   169k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   169k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 169k]
  |  |  ------------------
  ------------------
 1144|   169k|      const __m256i clamp_lo_out =
 1145|   169k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 1146|   169k|      const __m256i clamp_hi_out =
 1147|   169k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 1148|   169k|      round_shift_8x8_avx2(out, out_shift);
 1149|   169k|      round_shift_8x8_avx2(out + 16, out_shift);
 1150|   169k|      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
 1151|   169k|    }
 1152|   335k|  }
 1153|   335k|}
highbd_inv_txfm_avx2.c:idct64_low1_avx2:
 2959|   353k|                             int bd, int out_shift) {
 2960|   353k|  const int32_t *cospi = cospi_arr(bit);
 2961|   353k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 2962|   353k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   707k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 353k]
  |  |  |  Branch (35:31): [True: 269k, False: 84.5k]
  |  |  |  Branch (35:44): [True: 269k, False: 84.5k]
  |  |  ------------------
  ------------------
 2963|   353k|  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 2964|   353k|  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 2965|       |
 2966|   353k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 2967|       |
 2968|   353k|  {
 2969|   353k|    __m256i x;
 2970|       |
 2971|       |    // stage 1
 2972|       |    // stage 2
 2973|       |    // stage 3
 2974|       |    // stage 4
 2975|       |    // stage 5
 2976|       |    // stage 6
 2977|   353k|    x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
 2978|       |
 2979|       |    // stage 8
 2980|       |    // stage 9
 2981|       |    // stage 10
 2982|       |    // stage 11
 2983|   353k|    if (!do_cols) {
  ------------------
  |  Branch (2983:9): [True: 84.5k, False: 269k]
  ------------------
 2984|  84.5k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  84.5k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 84.5k]
  |  |  ------------------
  ------------------
 2985|  84.5k|      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2986|  84.5k|      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2987|  84.5k|      if (out_shift != 0) {
  ------------------
  |  Branch (2987:11): [True: 84.5k, False: 2]
  ------------------
 2988|  84.5k|        __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
 2989|  84.5k|        x = _mm256_add_epi32(x, offset);
 2990|  84.5k|        x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
 2991|  84.5k|      }
 2992|  84.5k|    }
 2993|   353k|    x = _mm256_max_epi32(x, clamp_lo);
 2994|   353k|    x = _mm256_min_epi32(x, clamp_hi);
 2995|   353k|    out[0] = x;
 2996|   353k|    out[1] = x;
 2997|   353k|    out[2] = x;
 2998|   353k|    out[3] = x;
 2999|   353k|    out[4] = x;
 3000|   353k|    out[5] = x;
 3001|   353k|    out[6] = x;
 3002|   353k|    out[7] = x;
 3003|   353k|    out[8] = x;
 3004|   353k|    out[9] = x;
 3005|   353k|    out[10] = x;
 3006|   353k|    out[11] = x;
 3007|   353k|    out[12] = x;
 3008|   353k|    out[13] = x;
 3009|   353k|    out[14] = x;
 3010|   353k|    out[15] = x;
 3011|   353k|    out[16] = x;
 3012|   353k|    out[17] = x;
 3013|   353k|    out[18] = x;
 3014|   353k|    out[19] = x;
 3015|   353k|    out[20] = x;
 3016|   353k|    out[21] = x;
 3017|   353k|    out[22] = x;
 3018|   353k|    out[23] = x;
 3019|   353k|    out[24] = x;
 3020|   353k|    out[25] = x;
 3021|   353k|    out[26] = x;
 3022|   353k|    out[27] = x;
 3023|   353k|    out[28] = x;
 3024|   353k|    out[29] = x;
 3025|   353k|    out[30] = x;
 3026|   353k|    out[31] = x;
 3027|   353k|    out[32] = x;
 3028|   353k|    out[33] = x;
 3029|   353k|    out[34] = x;
 3030|   353k|    out[35] = x;
 3031|   353k|    out[36] = x;
 3032|   353k|    out[37] = x;
 3033|   353k|    out[38] = x;
 3034|   353k|    out[39] = x;
 3035|   353k|    out[40] = x;
 3036|   353k|    out[41] = x;
 3037|   353k|    out[42] = x;
 3038|   353k|    out[43] = x;
 3039|   353k|    out[44] = x;
 3040|   353k|    out[45] = x;
 3041|   353k|    out[46] = x;
 3042|   353k|    out[47] = x;
 3043|   353k|    out[48] = x;
 3044|   353k|    out[49] = x;
 3045|   353k|    out[50] = x;
 3046|   353k|    out[51] = x;
 3047|   353k|    out[52] = x;
 3048|   353k|    out[53] = x;
 3049|   353k|    out[54] = x;
 3050|   353k|    out[55] = x;
 3051|   353k|    out[56] = x;
 3052|   353k|    out[57] = x;
 3053|   353k|    out[58] = x;
 3054|   353k|    out[59] = x;
 3055|   353k|    out[60] = x;
 3056|   353k|    out[61] = x;
 3057|   353k|    out[62] = x;
 3058|   353k|    out[63] = x;
 3059|   353k|  }
 3060|   353k|}
highbd_inv_txfm_avx2.c:idct64_low8_avx2:
 3062|   390k|                             int bd, int out_shift) {
 3063|   390k|  int i, j;
 3064|   390k|  const int32_t *cospi = cospi_arr(bit);
 3065|   390k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 3066|   390k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   780k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 390k]
  |  |  |  Branch (35:31): [True: 276k, False: 113k]
  |  |  |  Branch (35:44): [True: 276k, False: 113k]
  |  |  ------------------
  ------------------
 3067|   390k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 3068|   390k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 3069|       |
 3070|   390k|  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
 3071|   390k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 3072|   390k|  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
 3073|   390k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 3074|   390k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 3075|   390k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 3076|   390k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 3077|   390k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 3078|   390k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 3079|   390k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 3080|   390k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 3081|   390k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 3082|   390k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 3083|   390k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 3084|   390k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 3085|   390k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 3086|   390k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 3087|   390k|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
 3088|   390k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 3089|   390k|  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
 3090|   390k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 3091|   390k|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
 3092|   390k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 3093|   390k|  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
 3094|   390k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
 3095|   390k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 3096|   390k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 3097|   390k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 3098|   390k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 3099|   390k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 3100|   390k|  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
 3101|   390k|  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
 3102|   390k|  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
 3103|   390k|  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
 3104|   390k|  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
 3105|   390k|  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
 3106|   390k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
 3107|   390k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 3108|       |
 3109|   390k|  {
 3110|   390k|    __m256i u[64];
 3111|       |
 3112|       |    // stage 1
 3113|   390k|    u[0] = in[0];
 3114|   390k|    u[8] = in[4];
 3115|   390k|    u[16] = in[2];
 3116|   390k|    u[24] = in[6];
 3117|   390k|    u[32] = in[1];
 3118|   390k|    u[40] = in[5];
 3119|   390k|    u[48] = in[3];
 3120|   390k|    u[56] = in[7];
 3121|       |
 3122|       |    // stage 2
 3123|   390k|    u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
 3124|   390k|    u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
 3125|   390k|    u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
 3126|   390k|    u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
 3127|   390k|    u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
 3128|   390k|    u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
 3129|   390k|    u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
 3130|   390k|    u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
 3131|       |
 3132|       |    // stage 3
 3133|   390k|    u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
 3134|   390k|    u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
 3135|   390k|    u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
 3136|   390k|    u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
 3137|   390k|    u[33] = u[32];
 3138|   390k|    u[38] = u[39];
 3139|   390k|    u[41] = u[40];
 3140|   390k|    u[46] = u[47];
 3141|   390k|    u[49] = u[48];
 3142|   390k|    u[54] = u[55];
 3143|   390k|    u[57] = u[56];
 3144|   390k|    u[62] = u[63];
 3145|       |
 3146|       |    // stage 4
 3147|   390k|    __m256i temp1, temp2;
 3148|   390k|    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
 3149|   390k|    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
 3150|   390k|    u[17] = u[16];
 3151|   390k|    u[22] = u[23];
 3152|   390k|    u[25] = u[24];
 3153|   390k|    u[30] = u[31];
 3154|       |
 3155|   390k|    temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
 3156|   390k|    u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
 3157|   390k|    u[33] = temp1;
 3158|       |
 3159|   390k|    temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
 3160|   390k|    u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
 3161|   390k|    u[57] = temp2;
 3162|       |
 3163|   390k|    temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
 3164|   390k|    u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
 3165|   390k|    u[41] = temp1;
 3166|       |
 3167|   390k|    temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
 3168|   390k|    u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
 3169|   390k|    u[46] = temp2;
 3170|       |
 3171|       |    // stage 5
 3172|   390k|    u[9] = u[8];
 3173|   390k|    u[14] = u[15];
 3174|       |
 3175|   390k|    temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
 3176|   390k|    u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
 3177|   390k|    u[17] = temp1;
 3178|       |
 3179|   390k|    temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
 3180|   390k|    u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
 3181|   390k|    u[22] = temp2;
 3182|       |
 3183|   390k|    u[35] = u[32];
 3184|   390k|    u[34] = u[33];
 3185|   390k|    u[36] = u[39];
 3186|   390k|    u[37] = u[38];
 3187|   390k|    u[43] = u[40];
 3188|   390k|    u[42] = u[41];
 3189|   390k|    u[44] = u[47];
 3190|   390k|    u[45] = u[46];
 3191|   390k|    u[51] = u[48];
 3192|   390k|    u[50] = u[49];
 3193|   390k|    u[52] = u[55];
 3194|   390k|    u[53] = u[54];
 3195|   390k|    u[59] = u[56];
 3196|   390k|    u[58] = u[57];
 3197|   390k|    u[60] = u[63];
 3198|   390k|    u[61] = u[62];
 3199|       |
 3200|       |    // stage 6
 3201|   390k|    temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3202|   390k|    u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3203|   390k|    u[0] = temp1;
 3204|       |
 3205|   390k|    temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 3206|   390k|    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 3207|   390k|    u[9] = temp2;
 3208|   390k|    u[19] = u[16];
 3209|   390k|    u[18] = u[17];
 3210|   390k|    u[20] = u[23];
 3211|   390k|    u[21] = u[22];
 3212|   390k|    u[27] = u[24];
 3213|   390k|    u[26] = u[25];
 3214|   390k|    u[28] = u[31];
 3215|   390k|    u[29] = u[30];
 3216|       |
 3217|   390k|    temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
 3218|   390k|    u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
 3219|   390k|    u[34] = temp1;
 3220|   390k|    temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
 3221|   390k|    u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
 3222|   390k|    u[35] = temp2;
 3223|   390k|    temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
 3224|   390k|    u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
 3225|   390k|    u[36] = temp1;
 3226|   390k|    temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
 3227|   390k|    u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
 3228|   390k|    u[37] = temp2;
 3229|   390k|    temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
 3230|   390k|    u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
 3231|   390k|    u[42] = temp1;
 3232|   390k|    temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
 3233|   390k|    u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
 3234|   390k|    u[43] = temp2;
 3235|   390k|    temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
 3236|   390k|    u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
 3237|   390k|    u[44] = temp1;
 3238|   390k|    temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
 3239|   390k|    u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
 3240|   390k|    u[45] = temp2;
 3241|       |
 3242|       |    // stage 7
 3243|   390k|    u[3] = u[0];
 3244|   390k|    u[2] = u[1];
 3245|   390k|    u[11] = u[8];
 3246|   390k|    u[10] = u[9];
 3247|   390k|    u[12] = u[15];
 3248|   390k|    u[13] = u[14];
 3249|       |
 3250|   390k|    temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
 3251|   390k|    u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
 3252|   390k|    u[18] = temp1;
 3253|   390k|    temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
 3254|   390k|    u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
 3255|   390k|    u[19] = temp2;
 3256|   390k|    temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
 3257|   390k|    u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
 3258|   390k|    u[20] = temp1;
 3259|   390k|    temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
 3260|   390k|    u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
 3261|   390k|    u[21] = temp2;
 3262|  1.17M|    for (i = 32; i < 64; i += 16) {
  ------------------
  |  Branch (3262:18): [True: 780k, False: 390k]
  ------------------
 3263|  3.90M|      for (j = i; j < i + 4; j++) {
  ------------------
  |  Branch (3263:19): [True: 3.12M, False: 780k]
  ------------------
 3264|  3.12M|        addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
 3265|  3.12M|        addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
 3266|  3.12M|                    &clamp_hi);
 3267|  3.12M|      }
 3268|   780k|    }
 3269|       |
 3270|       |    // stage 8
 3271|   390k|    u[7] = u[0];
 3272|   390k|    u[6] = u[1];
 3273|   390k|    u[5] = u[2];
 3274|   390k|    u[4] = u[3];
 3275|       |
 3276|   390k|    idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
 3277|   390k|                       &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
 3278|       |
 3279|       |    // stage 9
 3280|   390k|    idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
 3281|   390k|                       bit);
 3282|       |
 3283|       |    // stage 10
 3284|   390k|    idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
 3285|   390k|                        bit);
 3286|       |
 3287|       |    // stage 11
 3288|   390k|    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
 3289|   390k|  }
 3290|   390k|}
highbd_inv_txfm_avx2.c:idct64_stage8_avx2:
 2825|   711k|    const __m256i *rnding, int bit) {
 2826|   711k|  int i;
 2827|   711k|  __m256i temp1, temp2, temp3, temp4;
 2828|   711k|  temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
 2829|   711k|  u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
 2830|   711k|  u[10] = temp1;
 2831|   711k|  temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
 2832|   711k|  u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
 2833|   711k|  u[11] = temp2;
 2834|       |
 2835|  3.55M|  for (i = 16; i < 20; ++i) {
  ------------------
  |  Branch (2835:16): [True: 2.84M, False: 711k]
  ------------------
 2836|  2.84M|    addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
 2837|  2.84M|    addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
 2838|  2.84M|  }
 2839|       |
 2840|   711k|  temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
 2841|   711k|  temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
 2842|   711k|  temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
 2843|   711k|  temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
 2844|   711k|  u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
 2845|   711k|  u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
 2846|   711k|  u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
 2847|   711k|  u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
 2848|   711k|  u[36] = temp1;
 2849|   711k|  u[37] = temp2;
 2850|   711k|  u[38] = temp3;
 2851|   711k|  u[39] = temp4;
 2852|       |
 2853|   711k|  temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
 2854|   711k|  temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
 2855|   711k|  temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
 2856|   711k|  temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
 2857|   711k|  u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
 2858|   711k|  u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
 2859|   711k|  u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
 2860|   711k|  u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
 2861|   711k|  u[40] = temp1;
 2862|   711k|  u[41] = temp2;
 2863|   711k|  u[42] = temp3;
 2864|   711k|  u[43] = temp4;
 2865|   711k|}
highbd_inv_txfm_avx2.c:idct64_stage9_avx2:
 2871|   711k|                                      const __m256i *rnding, int bit) {
 2872|   711k|  int i;
 2873|   711k|  __m256i temp1, temp2, temp3, temp4;
 2874|  6.39M|  for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (2874:15): [True: 5.68M, False: 711k]
  ------------------
 2875|  5.68M|    addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
 2876|  5.68M|  }
 2877|       |
 2878|   711k|  temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
 2879|   711k|  temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
 2880|   711k|  temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
 2881|   711k|  temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
 2882|   711k|  u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
 2883|   711k|  u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
 2884|   711k|  u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
 2885|   711k|  u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
 2886|   711k|  u[20] = temp1;
 2887|   711k|  u[21] = temp2;
 2888|   711k|  u[22] = temp3;
 2889|   711k|  u[23] = temp4;
 2890|  6.39M|  for (i = 32; i < 40; i++) {
  ------------------
  |  Branch (2890:16): [True: 5.68M, False: 711k]
  ------------------
 2891|  5.68M|    addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
 2892|  5.68M|  }
 2893|       |
 2894|  6.39M|  for (i = 48; i < 56; i++) {
  ------------------
  |  Branch (2894:16): [True: 5.68M, False: 711k]
  ------------------
 2895|  5.68M|    addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
 2896|  5.68M|  }
 2897|   711k|}
highbd_inv_txfm_avx2.c:idct64_stage10_avx2:
 2903|   710k|                                       const __m256i *rnding, int bit) {
 2904|   710k|  __m256i temp1, temp2, temp3, temp4;
 2905|  12.0M|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (2905:19): [True: 11.3M, False: 710k]
  ------------------
 2906|  11.3M|    addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
 2907|  11.3M|  }
 2908|       |
 2909|   710k|  temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
 2910|   710k|  temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
 2911|   710k|  temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
 2912|   710k|  temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
 2913|   710k|  u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
 2914|   710k|  u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
 2915|   710k|  u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
 2916|   710k|  u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
 2917|   710k|  u[40] = temp1;
 2918|   710k|  u[41] = temp2;
 2919|   710k|  u[42] = temp3;
 2920|   710k|  u[43] = temp4;
 2921|       |
 2922|   710k|  temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
 2923|   710k|  temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
 2924|   710k|  temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
 2925|   710k|  temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
 2926|   710k|  u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
 2927|   710k|  u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
 2928|   710k|  u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
 2929|   710k|  u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
 2930|   710k|  u[44] = temp1;
 2931|   710k|  u[45] = temp2;
 2932|   710k|  u[46] = temp3;
 2933|   710k|  u[47] = temp4;
 2934|   710k|}
highbd_inv_txfm_avx2.c:idct64_stage11_avx2:
 2939|   710k|                                       const __m256i *clamp_hi) {
 2940|  23.4M|  for (int i = 0; i < 32; i++) {
  ------------------
  |  Branch (2940:19): [True: 22.7M, False: 710k]
  ------------------
 2941|  22.7M|    addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
 2942|  22.7M|  }
 2943|       |
 2944|   710k|  if (!do_cols) {
  ------------------
  |  Branch (2944:7): [True: 256k, False: 454k]
  ------------------
 2945|   256k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   256k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 256k]
  |  |  ------------------
  ------------------
 2946|   256k|    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 2947|   256k|    const __m256i clamp_hi_out =
 2948|   256k|        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 2949|       |
 2950|   256k|    round_shift_8x8_avx2(out, out_shift);
 2951|   256k|    round_shift_8x8_avx2(out + 16, out_shift);
 2952|   256k|    round_shift_8x8_avx2(out + 32, out_shift);
 2953|   256k|    round_shift_8x8_avx2(out + 48, out_shift);
 2954|   256k|    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
 2955|   256k|  }
 2956|   710k|}
highbd_inv_txfm_avx2.c:idct64_low16_avx2:
 3292|   320k|                              int bd, int out_shift) {
 3293|   320k|  int i, j;
 3294|   320k|  const int32_t *cospi = cospi_arr(bit);
 3295|   320k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 3296|   320k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   641k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 320k]
  |  |  |  Branch (35:31): [True: 177k, False: 143k]
  |  |  |  Branch (35:44): [True: 177k, False: 143k]
  |  |  ------------------
  ------------------
 3297|   320k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 3298|   320k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 3299|       |
 3300|   320k|  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
 3301|   320k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 3302|   320k|  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
 3303|   320k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 3304|   320k|  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
 3305|   320k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 3306|   320k|  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
 3307|   320k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 3308|   320k|  const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
 3309|   320k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
 3310|   320k|  const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
 3311|   320k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 3312|   320k|  const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
 3313|   320k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
 3314|   320k|  const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
 3315|   320k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 3316|   320k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 3317|   320k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 3318|   320k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 3319|   320k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 3320|   320k|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
 3321|   320k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 3322|   320k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 3323|   320k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 3324|   320k|  const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
 3325|   320k|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
 3326|   320k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
 3327|   320k|  const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
 3328|   320k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 3329|   320k|  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
 3330|   320k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 3331|   320k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 3332|   320k|  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
 3333|       |
 3334|   320k|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
 3335|   320k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 3336|   320k|  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
 3337|   320k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 3338|   320k|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
 3339|   320k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 3340|   320k|  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
 3341|   320k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
 3342|   320k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 3343|   320k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 3344|   320k|  const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
 3345|   320k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 3346|   320k|  const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
 3347|   320k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
 3348|   320k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 3349|   320k|  const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
 3350|   320k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 3351|   320k|  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
 3352|   320k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
 3353|   320k|  const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
 3354|   320k|  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
 3355|       |
 3356|   320k|  {
 3357|   320k|    __m256i u[64];
 3358|   320k|    __m256i tmp1, tmp2, tmp3, tmp4;
 3359|       |    // stage 1
 3360|   320k|    u[0] = in[0];
 3361|   320k|    u[32] = in[1];
 3362|   320k|    u[36] = in[9];
 3363|   320k|    u[40] = in[5];
 3364|   320k|    u[44] = in[13];
 3365|   320k|    u[48] = in[3];
 3366|   320k|    u[52] = in[11];
 3367|   320k|    u[56] = in[7];
 3368|   320k|    u[60] = in[15];
 3369|   320k|    u[16] = in[2];
 3370|   320k|    u[20] = in[10];
 3371|   320k|    u[24] = in[6];
 3372|   320k|    u[28] = in[14];
 3373|   320k|    u[4] = in[8];
 3374|   320k|    u[8] = in[4];
 3375|   320k|    u[12] = in[12];
 3376|       |
 3377|       |    // stage 2
 3378|   320k|    u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
 3379|   320k|    u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
 3380|   320k|    u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
 3381|   320k|    u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
 3382|   320k|    u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
 3383|   320k|    u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
 3384|   320k|    u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
 3385|   320k|    u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
 3386|   320k|    u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
 3387|   320k|    u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
 3388|   320k|    u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
 3389|   320k|    u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
 3390|   320k|    u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
 3391|   320k|    u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
 3392|   320k|    u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
 3393|   320k|    u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
 3394|       |
 3395|       |    // stage 3
 3396|   320k|    u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
 3397|   320k|    u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
 3398|   320k|    u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
 3399|   320k|    u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
 3400|   320k|    u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
 3401|   320k|    u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
 3402|   320k|    u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
 3403|   320k|    u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
 3404|   320k|    u[33] = u[32];
 3405|   320k|    u[34] = u[35];
 3406|   320k|    u[37] = u[36];
 3407|   320k|    u[38] = u[39];
 3408|   320k|    u[41] = u[40];
 3409|   320k|    u[42] = u[43];
 3410|   320k|    u[45] = u[44];
 3411|   320k|    u[46] = u[47];
 3412|   320k|    u[49] = u[48];
 3413|   320k|    u[50] = u[51];
 3414|   320k|    u[53] = u[52];
 3415|   320k|    u[54] = u[55];
 3416|   320k|    u[57] = u[56];
 3417|   320k|    u[58] = u[59];
 3418|   320k|    u[61] = u[60];
 3419|   320k|    u[62] = u[63];
 3420|       |
 3421|       |    // stage 4
 3422|   320k|    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
 3423|   320k|    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
 3424|   320k|    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
 3425|   320k|    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
 3426|       |
 3427|   320k|    u[17] = u[16];
 3428|   320k|    u[18] = u[19];
 3429|   320k|    u[21] = u[20];
 3430|   320k|    u[22] = u[23];
 3431|   320k|    u[25] = u[24];
 3432|   320k|    u[26] = u[27];
 3433|   320k|    u[29] = u[28];
 3434|   320k|    u[30] = u[31];
 3435|       |
 3436|   320k|    tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
 3437|   320k|    tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
 3438|   320k|    tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
 3439|   320k|    tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
 3440|   320k|    u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
 3441|   320k|    u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
 3442|   320k|    u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
 3443|   320k|    u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
 3444|   320k|    u[33] = tmp1;
 3445|   320k|    u[34] = tmp2;
 3446|   320k|    u[37] = tmp3;
 3447|   320k|    u[38] = tmp4;
 3448|       |
 3449|   320k|    tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
 3450|   320k|    tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
 3451|   320k|    tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
 3452|   320k|    tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
 3453|   320k|    u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
 3454|   320k|    u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
 3455|   320k|    u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
 3456|   320k|    u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
 3457|   320k|    u[41] = tmp1;
 3458|   320k|    u[42] = tmp2;
 3459|   320k|    u[45] = tmp3;
 3460|   320k|    u[46] = tmp4;
 3461|       |
 3462|       |    // stage 5
 3463|   320k|    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
 3464|   320k|    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
 3465|       |
 3466|   320k|    u[9] = u[8];
 3467|   320k|    u[10] = u[11];
 3468|   320k|    u[13] = u[12];
 3469|   320k|    u[14] = u[15];
 3470|       |
 3471|   320k|    tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
 3472|   320k|    tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
 3473|   320k|    tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
 3474|   320k|    tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
 3475|   320k|    u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
 3476|   320k|    u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
 3477|   320k|    u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
 3478|   320k|    u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
 3479|   320k|    u[17] = tmp1;
 3480|   320k|    u[18] = tmp2;
 3481|   320k|    u[21] = tmp3;
 3482|   320k|    u[22] = tmp4;
 3483|       |
 3484|  1.60M|    for (i = 32; i < 64; i += 8) {
  ------------------
  |  Branch (3484:18): [True: 1.28M, False: 320k]
  ------------------
 3485|  1.28M|      addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
 3486|  1.28M|                  &clamp_hi);
 3487|  1.28M|      addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
 3488|  1.28M|                  &clamp_hi);
 3489|       |
 3490|  1.28M|      addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
 3491|  1.28M|                  &clamp_hi);
 3492|  1.28M|      addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
 3493|  1.28M|                  &clamp_hi);
 3494|  1.28M|    }
 3495|       |
 3496|       |    // stage 6
 3497|   320k|    tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3498|   320k|    u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3499|   320k|    u[0] = tmp1;
 3500|   320k|    u[5] = u[4];
 3501|   320k|    u[6] = u[7];
 3502|       |
 3503|   320k|    tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 3504|   320k|    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 3505|   320k|    u[9] = tmp1;
 3506|   320k|    tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 3507|   320k|    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 3508|   320k|    u[10] = tmp2;
 3509|       |
 3510|   962k|    for (i = 16; i < 32; i += 8) {
  ------------------
  |  Branch (3510:18): [True: 642k, False: 320k]
  ------------------
 3511|   642k|      addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
 3512|   642k|                  &clamp_hi);
 3513|   642k|      addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
 3514|   642k|                  &clamp_hi);
 3515|       |
 3516|   642k|      addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
 3517|   642k|                  &clamp_hi);
 3518|   642k|      addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
 3519|   642k|                  &clamp_hi);
 3520|   642k|    }
 3521|       |
 3522|   320k|    tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
 3523|   320k|    tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
 3524|   320k|    tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
 3525|   320k|    tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
 3526|   320k|    u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
 3527|   320k|    u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
 3528|   320k|    u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
 3529|   320k|    u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
 3530|   320k|    u[34] = tmp1;
 3531|   320k|    u[35] = tmp2;
 3532|   320k|    u[36] = tmp3;
 3533|   320k|    u[37] = tmp4;
 3534|       |
 3535|   320k|    tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
 3536|   320k|    tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
 3537|   320k|    tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
 3538|   320k|    tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
 3539|   320k|    u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
 3540|   320k|    u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
 3541|   320k|    u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
 3542|   320k|    u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
 3543|   320k|    u[42] = tmp1;
 3544|   320k|    u[43] = tmp2;
 3545|   320k|    u[44] = tmp3;
 3546|   320k|    u[45] = tmp4;
 3547|       |
 3548|       |    // stage 7
 3549|   320k|    u[3] = u[0];
 3550|   320k|    u[2] = u[1];
 3551|   320k|    tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
 3552|   320k|    u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
 3553|   320k|    u[5] = tmp1;
 3554|   320k|    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 3555|   320k|    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 3556|   320k|    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 3557|   320k|    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 3558|       |
 3559|   320k|    tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
 3560|   320k|    tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
 3561|   320k|    tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
 3562|   320k|    tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
 3563|   320k|    u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
 3564|   320k|    u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
 3565|   320k|    u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
 3566|   320k|    u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
 3567|   320k|    u[18] = tmp1;
 3568|   320k|    u[19] = tmp2;
 3569|   320k|    u[20] = tmp3;
 3570|   320k|    u[21] = tmp4;
 3571|       |
 3572|   962k|    for (i = 32; i < 64; i += 16) {
  ------------------
  |  Branch (3572:18): [True: 642k, False: 320k]
  ------------------
 3573|  3.21M|      for (j = i; j < i + 4; j++) {
  ------------------
  |  Branch (3573:19): [True: 2.56M, False: 642k]
  ------------------
 3574|  2.56M|        addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
 3575|  2.56M|        addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
 3576|  2.56M|                    &clamp_hi);
 3577|  2.56M|      }
 3578|   642k|    }
 3579|       |
 3580|       |    // stage 8
 3581|  1.60M|    for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (3581:17): [True: 1.28M, False: 320k]
  ------------------
 3582|  1.28M|      addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
 3583|  1.28M|    }
 3584|       |
 3585|   320k|    idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
 3586|   320k|                       &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
 3587|       |
 3588|       |    // stage 9
 3589|   320k|    idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
 3590|   320k|                       bit);
 3591|       |
 3592|       |    // stage 10
 3593|   320k|    idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
 3594|   320k|                        bit);
 3595|       |
 3596|       |    // stage 11
 3597|   320k|    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
 3598|   320k|  }
 3599|   320k|}
highbd_inv_txfm_avx2.c:idct64_avx2:
 3601|   268k|                        int out_shift) {
 3602|   268k|  int i, j;
 3603|   268k|  const int32_t *cospi = cospi_arr(bit);
 3604|   268k|  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
 3605|   268k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   537k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 268k]
  |  |  |  Branch (35:31): [True: 153k, False: 115k]
  |  |  |  Branch (35:44): [True: 153k, False: 115k]
  |  |  ------------------
  ------------------
 3606|   268k|  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
 3607|   268k|  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
 3608|       |
 3609|   268k|  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
 3610|   268k|  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
 3611|   268k|  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
 3612|   268k|  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
 3613|   268k|  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
 3614|   268k|  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
 3615|   268k|  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
 3616|   268k|  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
 3617|   268k|  const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
 3618|   268k|  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
 3619|   268k|  const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
 3620|   268k|  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
 3621|   268k|  const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
 3622|   268k|  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
 3623|   268k|  const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
 3624|   268k|  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
 3625|   268k|  const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
 3626|   268k|  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
 3627|   268k|  const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
 3628|   268k|  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
 3629|   268k|  const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
 3630|   268k|  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
 3631|   268k|  const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
 3632|   268k|  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
 3633|   268k|  const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
 3634|   268k|  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
 3635|   268k|  const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
 3636|   268k|  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
 3637|   268k|  const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
 3638|   268k|  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
 3639|   268k|  const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
 3640|   268k|  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
 3641|   268k|  const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
 3642|   268k|  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
 3643|   268k|  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
 3644|   268k|  const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
 3645|   268k|  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
 3646|   268k|  const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
 3647|   268k|  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
 3648|   268k|  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
 3649|   268k|  const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
 3650|   268k|  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
 3651|   268k|  const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
 3652|   268k|  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
 3653|   268k|  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
 3654|   268k|  const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
 3655|   268k|  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
 3656|   268k|  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
 3657|   268k|  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
 3658|   268k|  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
 3659|   268k|  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
 3660|       |
 3661|   268k|  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
 3662|   268k|  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
 3663|   268k|  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
 3664|   268k|  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
 3665|   268k|  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
 3666|   268k|  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
 3667|   268k|  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
 3668|   268k|  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
 3669|   268k|  const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
 3670|   268k|  const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
 3671|   268k|  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
 3672|   268k|  const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
 3673|   268k|  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
 3674|   268k|  const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
 3675|   268k|  const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
 3676|   268k|  const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
 3677|   268k|  const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
 3678|   268k|  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
 3679|   268k|  const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
 3680|   268k|  const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
 3681|   268k|  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
 3682|   268k|  const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
 3683|   268k|  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
 3684|   268k|  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
 3685|   268k|  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
 3686|   268k|  const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
 3687|   268k|  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
 3688|       |
 3689|   268k|  {
 3690|   268k|    __m256i u[64], v[64];
 3691|       |
 3692|       |    // stage 1
 3693|   268k|    u[32] = in[1];
 3694|   268k|    u[34] = in[17];
 3695|   268k|    u[36] = in[9];
 3696|   268k|    u[38] = in[25];
 3697|   268k|    u[40] = in[5];
 3698|   268k|    u[42] = in[21];
 3699|   268k|    u[44] = in[13];
 3700|   268k|    u[46] = in[29];
 3701|   268k|    u[48] = in[3];
 3702|   268k|    u[50] = in[19];
 3703|   268k|    u[52] = in[11];
 3704|   268k|    u[54] = in[27];
 3705|   268k|    u[56] = in[7];
 3706|   268k|    u[58] = in[23];
 3707|   268k|    u[60] = in[15];
 3708|   268k|    u[62] = in[31];
 3709|       |
 3710|   268k|    v[16] = in[2];
 3711|   268k|    v[18] = in[18];
 3712|   268k|    v[20] = in[10];
 3713|   268k|    v[22] = in[26];
 3714|   268k|    v[24] = in[6];
 3715|   268k|    v[26] = in[22];
 3716|   268k|    v[28] = in[14];
 3717|   268k|    v[30] = in[30];
 3718|       |
 3719|   268k|    u[8] = in[4];
 3720|   268k|    u[10] = in[20];
 3721|   268k|    u[12] = in[12];
 3722|   268k|    u[14] = in[28];
 3723|       |
 3724|   268k|    v[4] = in[8];
 3725|   268k|    v[6] = in[24];
 3726|       |
 3727|   268k|    u[0] = in[0];
 3728|   268k|    u[2] = in[16];
 3729|       |
 3730|       |    // stage 2
 3731|   268k|    v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
 3732|   268k|    v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
 3733|   268k|    v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
 3734|   268k|    v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
 3735|   268k|    v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
 3736|   268k|    v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
 3737|   268k|    v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
 3738|   268k|    v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
 3739|   268k|    v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
 3740|   268k|    v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
 3741|   268k|    v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
 3742|   268k|    v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
 3743|   268k|    v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
 3744|   268k|    v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
 3745|   268k|    v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
 3746|   268k|    v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
 3747|   268k|    v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
 3748|   268k|    v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
 3749|   268k|    v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
 3750|   268k|    v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
 3751|   268k|    v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
 3752|   268k|    v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
 3753|   268k|    v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
 3754|   268k|    v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
 3755|   268k|    v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
 3756|   268k|    v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
 3757|   268k|    v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
 3758|   268k|    v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
 3759|   268k|    v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
 3760|   268k|    v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
 3761|   268k|    v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
 3762|   268k|    v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
 3763|       |
 3764|       |    // stage 3
 3765|   268k|    u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
 3766|   268k|    u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
 3767|   268k|    u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
 3768|   268k|    u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
 3769|   268k|    u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
 3770|   268k|    u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
 3771|   268k|    u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
 3772|   268k|    u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
 3773|   268k|    u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
 3774|   268k|    u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
 3775|   268k|    u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
 3776|   268k|    u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
 3777|   268k|    u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
 3778|   268k|    u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
 3779|   268k|    u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
 3780|   268k|    u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
 3781|       |
 3782|  2.41M|    for (i = 32; i < 64; i += 4) {
  ------------------
  |  Branch (3782:18): [True: 2.15M, False: 268k]
  ------------------
 3783|  2.15M|      addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
 3784|  2.15M|                  &clamp_hi);
 3785|  2.15M|      addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
 3786|  2.15M|                  &clamp_hi);
 3787|  2.15M|    }
 3788|       |
 3789|       |    // stage 4
 3790|   268k|    v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
 3791|   268k|    v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
 3792|   268k|    v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
 3793|   268k|    v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
 3794|   268k|    v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
 3795|   268k|    v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
 3796|   268k|    v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
 3797|   268k|    v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
 3798|       |
 3799|  1.34M|    for (i = 16; i < 32; i += 4) {
  ------------------
  |  Branch (3799:18): [True: 1.07M, False: 268k]
  ------------------
 3800|  1.07M|      addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
 3801|  1.07M|                  &clamp_hi);
 3802|  1.07M|      addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
 3803|  1.07M|                  &clamp_hi);
 3804|  1.07M|    }
 3805|       |
 3806|  2.41M|    for (i = 32; i < 64; i += 4) {
  ------------------
  |  Branch (3806:18): [True: 2.15M, False: 268k]
  ------------------
 3807|  2.15M|      v[i + 0] = u[i + 0];
 3808|  2.15M|      v[i + 3] = u[i + 3];
 3809|  2.15M|    }
 3810|       |
 3811|   268k|    v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
 3812|   268k|    v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
 3813|   268k|    v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
 3814|   268k|    v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
 3815|   268k|    v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
 3816|   268k|    v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
 3817|   268k|    v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
 3818|   268k|    v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
 3819|   268k|    v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
 3820|   268k|    v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
 3821|   268k|    v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
 3822|   268k|    v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
 3823|   268k|    v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
 3824|   268k|    v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
 3825|   268k|    v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
 3826|   268k|    v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
 3827|       |
 3828|       |    // stage 5
 3829|   268k|    u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
 3830|   268k|    u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
 3831|   268k|    u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
 3832|   268k|    u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
 3833|       |
 3834|   806k|    for (i = 8; i < 16; i += 4) {
  ------------------
  |  Branch (3834:17): [True: 537k, False: 268k]
  ------------------
 3835|   537k|      addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
 3836|   537k|                  &clamp_hi);
 3837|   537k|      addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
 3838|   537k|                  &clamp_hi);
 3839|   537k|    }
 3840|       |
 3841|  1.34M|    for (i = 16; i < 32; i += 4) {
  ------------------
  |  Branch (3841:18): [True: 1.07M, False: 268k]
  ------------------
 3842|  1.07M|      u[i + 0] = v[i + 0];
 3843|  1.07M|      u[i + 3] = v[i + 3];
 3844|  1.07M|    }
 3845|       |
 3846|   268k|    u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
 3847|   268k|    u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
 3848|   268k|    u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
 3849|   268k|    u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
 3850|   268k|    u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
 3851|   268k|    u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
 3852|   268k|    u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
 3853|   268k|    u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
 3854|       |
 3855|  1.34M|    for (i = 32; i < 64; i += 8) {
  ------------------
  |  Branch (3855:18): [True: 1.07M, False: 268k]
  ------------------
 3856|  1.07M|      addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
 3857|  1.07M|                  &clamp_hi);
 3858|  1.07M|      addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
 3859|  1.07M|                  &clamp_hi);
 3860|       |
 3861|  1.07M|      addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
 3862|  1.07M|                  &clamp_hi);
 3863|  1.07M|      addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
 3864|  1.07M|                  &clamp_hi);
 3865|  1.07M|    }
 3866|       |
 3867|       |    // stage 6
 3868|   268k|    v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3869|   268k|    v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
 3870|   268k|    v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
 3871|   268k|    v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
 3872|       |
 3873|   268k|    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
 3874|   268k|    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
 3875|       |
 3876|   806k|    for (i = 8; i < 16; i += 4) {
  ------------------
  |  Branch (3876:17): [True: 537k, False: 268k]
  ------------------
 3877|   537k|      v[i + 0] = u[i + 0];
 3878|   537k|      v[i + 3] = u[i + 3];
 3879|   537k|    }
 3880|       |
 3881|   268k|    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 3882|   268k|    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 3883|   268k|    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 3884|   268k|    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 3885|       |
 3886|   806k|    for (i = 16; i < 32; i += 8) {
  ------------------
  |  Branch (3886:18): [True: 537k, False: 268k]
  ------------------
 3887|   537k|      addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
 3888|   537k|                  &clamp_hi);
 3889|   537k|      addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
 3890|   537k|                  &clamp_hi);
 3891|       |
 3892|   537k|      addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
 3893|   537k|                  &clamp_hi);
 3894|   537k|      addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
 3895|   537k|                  &clamp_hi);
 3896|   537k|    }
 3897|       |
 3898|  1.34M|    for (i = 32; i < 64; i += 8) {
  ------------------
  |  Branch (3898:18): [True: 1.07M, False: 268k]
  ------------------
 3899|  1.07M|      v[i + 0] = u[i + 0];
 3900|  1.07M|      v[i + 1] = u[i + 1];
 3901|  1.07M|      v[i + 6] = u[i + 6];
 3902|  1.07M|      v[i + 7] = u[i + 7];
 3903|  1.07M|    }
 3904|       |
 3905|   268k|    v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
 3906|   268k|    v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
 3907|   268k|    v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
 3908|   268k|    v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
 3909|   268k|    v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
 3910|   268k|    v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
 3911|   268k|    v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
 3912|   268k|    v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
 3913|   268k|    v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
 3914|   268k|    v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
 3915|   268k|    v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
 3916|   268k|    v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
 3917|   268k|    v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
 3918|   268k|    v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
 3919|   268k|    v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
 3920|   268k|    v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
 3921|       |
 3922|       |    // stage 7
 3923|   268k|    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 3924|   268k|    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 3925|       |
 3926|   268k|    u[4] = v[4];
 3927|   268k|    u[7] = v[7];
 3928|   268k|    u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
 3929|   268k|    u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
 3930|       |
 3931|   268k|    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 3932|   268k|    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 3933|   268k|    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 3934|   268k|    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 3935|       |
 3936|   806k|    for (i = 16; i < 32; i += 8) {
  ------------------
  |  Branch (3936:18): [True: 537k, False: 268k]
  ------------------
 3937|   537k|      u[i + 0] = v[i + 0];
 3938|   537k|      u[i + 1] = v[i + 1];
 3939|   537k|      u[i + 6] = v[i + 6];
 3940|   537k|      u[i + 7] = v[i + 7];
 3941|   537k|    }
 3942|       |
 3943|   268k|    u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
 3944|   268k|    u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
 3945|   268k|    u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
 3946|   268k|    u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
 3947|   268k|    u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
 3948|   268k|    u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
 3949|   268k|    u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
 3950|   268k|    u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
 3951|       |
 3952|   806k|    for (i = 32; i < 64; i += 16) {
  ------------------
  |  Branch (3952:18): [True: 537k, False: 268k]
  ------------------
 3953|  2.68M|      for (j = i; j < i + 4; j++) {
  ------------------
  |  Branch (3953:19): [True: 2.15M, False: 537k]
  ------------------
 3954|  2.15M|        addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
 3955|  2.15M|        addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
 3956|  2.15M|                    &clamp_hi);
 3957|  2.15M|      }
 3958|   537k|    }
 3959|       |
 3960|       |    // stage 8
 3961|  1.34M|    for (i = 0; i < 4; ++i) {
  ------------------
  |  Branch (3961:17): [True: 1.07M, False: 268k]
  ------------------
 3962|  1.07M|      addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
 3963|  1.07M|    }
 3964|       |
 3965|   268k|    v[8] = u[8];
 3966|   268k|    v[9] = u[9];
 3967|   268k|    v[14] = u[14];
 3968|   268k|    v[15] = u[15];
 3969|       |
 3970|   268k|    v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
 3971|   268k|    v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
 3972|   268k|    v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
 3973|   268k|    v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
 3974|       |
 3975|  1.34M|    for (i = 16; i < 20; ++i) {
  ------------------
  |  Branch (3975:18): [True: 1.07M, False: 268k]
  ------------------
 3976|  1.07M|      addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
 3977|  1.07M|      addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
 3978|  1.07M|                  &clamp_hi);
 3979|  1.07M|    }
 3980|       |
 3981|  1.34M|    for (i = 32; i < 36; ++i) {
  ------------------
  |  Branch (3981:18): [True: 1.07M, False: 268k]
  ------------------
 3982|  1.07M|      v[i] = u[i];
 3983|  1.07M|      v[i + 12] = u[i + 12];
 3984|  1.07M|      v[i + 16] = u[i + 16];
 3985|  1.07M|      v[i + 28] = u[i + 28];
 3986|  1.07M|    }
 3987|       |
 3988|   268k|    v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
 3989|   268k|    v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
 3990|   268k|    v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
 3991|   268k|    v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
 3992|   268k|    v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
 3993|   268k|    v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
 3994|   268k|    v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
 3995|   268k|    v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
 3996|   268k|    v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
 3997|   268k|    v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
 3998|   268k|    v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
 3999|   268k|    v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
 4000|   268k|    v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
 4001|   268k|    v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
 4002|   268k|    v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
 4003|   268k|    v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
 4004|       |
 4005|       |    // stage 9
 4006|  2.41M|    for (i = 0; i < 8; ++i) {
  ------------------
  |  Branch (4006:17): [True: 2.15M, False: 268k]
  ------------------
 4007|  2.15M|      addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
 4008|  2.15M|    }
 4009|       |
 4010|  1.34M|    for (i = 16; i < 20; ++i) {
  ------------------
  |  Branch (4010:18): [True: 1.07M, False: 268k]
  ------------------
 4011|  1.07M|      u[i] = v[i];
 4012|  1.07M|      u[i + 12] = v[i + 12];
 4013|  1.07M|    }
 4014|       |
 4015|   268k|    u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
 4016|   268k|    u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
 4017|   268k|    u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
 4018|   268k|    u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
 4019|   268k|    u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
 4020|   268k|    u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
 4021|   268k|    u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
 4022|   268k|    u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
 4023|       |
 4024|  2.41M|    for (i = 32; i < 40; i++) {
  ------------------
  |  Branch (4024:18): [True: 2.15M, False: 268k]
  ------------------
 4025|  2.15M|      addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
 4026|  2.15M|    }
 4027|       |
 4028|  2.41M|    for (i = 48; i < 56; i++) {
  ------------------
  |  Branch (4028:18): [True: 2.15M, False: 268k]
  ------------------
 4029|  2.15M|      addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
 4030|  2.15M|    }
 4031|       |
 4032|       |    // stage 10
 4033|  4.56M|    for (i = 0; i < 16; i++) {
  ------------------
  |  Branch (4033:17): [True: 4.30M, False: 268k]
  ------------------
 4034|  4.30M|      addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
 4035|  4.30M|    }
 4036|       |
 4037|  2.41M|    for (i = 32; i < 40; i++) v[i] = u[i];
  ------------------
  |  Branch (4037:18): [True: 2.15M, False: 268k]
  ------------------
 4038|       |
 4039|   268k|    v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
 4040|   268k|    v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
 4041|   268k|    v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
 4042|   268k|    v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
 4043|   268k|    v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
 4044|   268k|    v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
 4045|   268k|    v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
 4046|   268k|    v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
 4047|   268k|    v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
 4048|   268k|    v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
 4049|   268k|    v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
 4050|   268k|    v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
 4051|   268k|    v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
 4052|   268k|    v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
 4053|   268k|    v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
 4054|   268k|    v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
 4055|       |
 4056|  2.41M|    for (i = 56; i < 64; i++) v[i] = u[i];
  ------------------
  |  Branch (4056:18): [True: 2.15M, False: 268k]
  ------------------
 4057|       |
 4058|       |    // stage 11
 4059|  8.86M|    for (i = 0; i < 32; i++) {
  ------------------
  |  Branch (4059:17): [True: 8.60M, False: 268k]
  ------------------
 4060|  8.60M|      addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
 4061|  8.60M|                  &clamp_hi);
 4062|  8.60M|    }
 4063|   268k|    if (!do_cols) {
  ------------------
  |  Branch (4063:9): [True: 115k, False: 153k]
  ------------------
 4064|   115k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   115k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 115k]
  |  |  ------------------
  ------------------
 4065|   115k|      const __m256i clamp_lo_out =
 4066|   115k|          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
 4067|   115k|      const __m256i clamp_hi_out =
 4068|   115k|          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
 4069|       |
 4070|   115k|      round_shift_8x8_avx2(out, out_shift);
 4071|   115k|      round_shift_8x8_avx2(out + 16, out_shift);
 4072|   115k|      round_shift_8x8_avx2(out + 32, out_shift);
 4073|   115k|      round_shift_8x8_avx2(out + 48, out_shift);
 4074|   115k|      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
 4075|   115k|    }
 4076|   268k|  }
 4077|   268k|}
highbd_inv_txfm_avx2.c:load_buffer_32bit_input:
  235|  5.15M|                                           __m256i *out, int out_size) {
  236|  59.6M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (236:19): [True: 54.4M, False: 5.15M]
  ------------------
  237|  54.4M|    out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
  238|  54.4M|  }
  239|  5.15M|}
highbd_inv_txfm_avx2.c:transpose_8x8_flip_avx2:
  197|   224k|static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
  198|   224k|  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  199|   224k|  __m256i x0, x1;
  200|       |
  201|   224k|  u0 = _mm256_unpacklo_epi32(in[7], in[6]);
  202|   224k|  u1 = _mm256_unpackhi_epi32(in[7], in[6]);
  203|       |
  204|   224k|  u2 = _mm256_unpacklo_epi32(in[5], in[4]);
  205|   224k|  u3 = _mm256_unpackhi_epi32(in[5], in[4]);
  206|       |
  207|   224k|  u4 = _mm256_unpacklo_epi32(in[3], in[2]);
  208|   224k|  u5 = _mm256_unpackhi_epi32(in[3], in[2]);
  209|       |
  210|   224k|  u6 = _mm256_unpacklo_epi32(in[1], in[0]);
  211|   224k|  u7 = _mm256_unpackhi_epi32(in[1], in[0]);
  212|       |
  213|   224k|  x0 = _mm256_unpacklo_epi64(u0, u2);
  214|   224k|  x1 = _mm256_unpacklo_epi64(u4, u6);
  215|   224k|  out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
  216|   224k|  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
  217|       |
  218|   224k|  x0 = _mm256_unpackhi_epi64(u0, u2);
  219|   224k|  x1 = _mm256_unpackhi_epi64(u4, u6);
  220|   224k|  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
  221|   224k|  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
  222|       |
  223|   224k|  x0 = _mm256_unpacklo_epi64(u1, u3);
  224|   224k|  x1 = _mm256_unpacklo_epi64(u5, u7);
  225|   224k|  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
  226|   224k|  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
  227|       |
  228|   224k|  x0 = _mm256_unpackhi_epi64(u1, u3);
  229|   224k|  x1 = _mm256_unpackhi_epi64(u5, u7);
  230|   224k|  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
  231|   224k|  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
  232|   224k|}
highbd_inv_txfm_avx2.c:transpose_8x8_avx2:
  160|  13.5M|static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
  161|  13.5M|  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  162|  13.5M|  __m256i x0, x1;
  163|       |
  164|  13.5M|  u0 = _mm256_unpacklo_epi32(in[0], in[1]);
  165|  13.5M|  u1 = _mm256_unpackhi_epi32(in[0], in[1]);
  166|       |
  167|  13.5M|  u2 = _mm256_unpacklo_epi32(in[2], in[3]);
  168|  13.5M|  u3 = _mm256_unpackhi_epi32(in[2], in[3]);
  169|       |
  170|  13.5M|  u4 = _mm256_unpacklo_epi32(in[4], in[5]);
  171|  13.5M|  u5 = _mm256_unpackhi_epi32(in[4], in[5]);
  172|       |
  173|  13.5M|  u6 = _mm256_unpacklo_epi32(in[6], in[7]);
  174|  13.5M|  u7 = _mm256_unpackhi_epi32(in[6], in[7]);
  175|       |
  176|  13.5M|  x0 = _mm256_unpacklo_epi64(u0, u2);
  177|  13.5M|  x1 = _mm256_unpacklo_epi64(u4, u6);
  178|  13.5M|  out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
  179|  13.5M|  out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
  180|       |
  181|  13.5M|  x0 = _mm256_unpackhi_epi64(u0, u2);
  182|  13.5M|  x1 = _mm256_unpackhi_epi64(u4, u6);
  183|  13.5M|  out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
  184|  13.5M|  out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
  185|       |
  186|  13.5M|  x0 = _mm256_unpacklo_epi64(u1, u3);
  187|  13.5M|  x1 = _mm256_unpacklo_epi64(u5, u7);
  188|  13.5M|  out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
  189|  13.5M|  out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
  190|       |
  191|  13.5M|  x0 = _mm256_unpackhi_epi64(u1, u3);
  192|  13.5M|  x1 = _mm256_unpackhi_epi64(u5, u7);
  193|  13.5M|  out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
  194|  13.5M|  out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
  195|  13.5M|}
highbd_inv_txfm_avx2.c:highbd_write_buffer_16xn_avx2:
  107|  4.88M|                                                 int height, const int bd) {
  108|  4.88M|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (108:11): [True: 58.5k, False: 4.83M]
  ------------------
  109|  4.88M|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (109:20): [True: 58.5k, False: 4.83M]
  ------------------
  110|   107M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (110:19): [True: 102M, False: 4.88M]
  ------------------
  111|   102M|    __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
  112|   102M|    __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
  113|       |
  114|   102M|    _mm256_storeu_si256((__m256i *)(output + i * stride), u);
  115|   102M|  }
  116|  4.88M|}
highbd_inv_txfm_avx2.c:highbd_get_recon_16x8_avx2:
   93|   102M|                                                 const int bd) {
   94|   102M|  __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
   95|   102M|  __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
   96|       |
   97|   102M|  x0 = _mm256_add_epi32(res0, x0);
   98|   102M|  x1 = _mm256_add_epi32(res1, x1);
   99|   102M|  x0 = _mm256_packus_epi32(x0, x1);
  100|   102M|  x0 = _mm256_permute4x64_epi64(x0, 0xd8);
  101|   102M|  x0 = highbd_clamp_epi16_avx2(x0, bd);
  102|   102M|  return x0;
  103|   102M|}
highbd_inv_txfm_avx2.c:highbd_clamp_epi16_avx2:
   34|   120M|static inline __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
   35|   120M|  const __m256i zero = _mm256_setzero_si256();
   36|   120M|  const __m256i one = _mm256_set1_epi16(1);
   37|   120M|  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
   38|   120M|  __m256i clamped, mask;
   39|       |
   40|   120M|  mask = _mm256_cmpgt_epi16(u, max);
   41|   120M|  clamped = _mm256_andnot_si256(mask, u);
   42|   120M|  mask = _mm256_and_si256(mask, max);
   43|   120M|  clamped = _mm256_or_si256(mask, clamped);
   44|   120M|  mask = _mm256_cmpgt_epi16(clamped, zero);
   45|   120M|  clamped = _mm256_and_si256(clamped, mask);
   46|       |
   47|   120M|  return clamped;
   48|   120M|}
highbd_inv_txfm_avx2.c:highbd_write_buffer_8xn_avx2:
  129|  1.68M|                                                int height, const int bd) {
  130|  1.68M|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (130:11): [True: 74.7k, False: 1.61M]
  ------------------
  131|  1.68M|  __m128i temp;
  132|  1.68M|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (132:20): [True: 74.7k, False: 1.61M]
  ------------------
  133|  19.4M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (133:19): [True: 17.7M, False: 1.68M]
  ------------------
  134|  17.7M|    temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
  135|  17.7M|    __m256i v = _mm256_cvtepi16_epi32(temp);
  136|  17.7M|    __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
  137|  17.7M|    __m128i u1 = _mm256_castsi256_si128(u);
  138|  17.7M|    _mm_storeu_si128((__m128i *)(output + i * stride), u1);
  139|  17.7M|  }
  140|  1.68M|}
highbd_inv_txfm_avx2.c:highbd_get_recon_8x8_avx2:
  118|  17.7M|                                                const int bd) {
  119|  17.7M|  __m256i x0 = pred;
  120|  17.7M|  x0 = _mm256_add_epi32(res, x0);
  121|  17.7M|  x0 = _mm256_packus_epi32(x0, x0);
  122|  17.7M|  x0 = _mm256_permute4x64_epi64(x0, 0xd8);
  123|  17.7M|  x0 = highbd_clamp_epi16_avx2(x0, bd);
  124|  17.7M|  return x0;
  125|  17.7M|}

av1_highbd_iwht4x4_16_add_sse4_1:
  149|   413k|                                      int stride, int bd) {
  150|       |  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
  151|       |     0.5 shifts per pixel. */
  152|   413k|  __m128i op[4];
  153|   413k|  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  ------------------
  |  |   75|   413k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  154|       |
  155|   413k|  load_buffer_4x4(input, op);
  156|       |
  157|       |  // Shift before-hand.
  158|   413k|  op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
  ------------------
  |  |   21|   413k|#define UNIT_QUANT_SHIFT 2
  ------------------
  159|   413k|  op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
  ------------------
  |  |   21|   413k|#define UNIT_QUANT_SHIFT 2
  ------------------
  160|   413k|  op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
  ------------------
  |  |   21|   413k|#define UNIT_QUANT_SHIFT 2
  ------------------
  161|   413k|  op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);
  ------------------
  |  |   21|   413k|#define UNIT_QUANT_SHIFT 2
  ------------------
  162|       |
  163|  1.24M|  for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (163:19): [True: 826k, False: 413k]
  ------------------
  164|   826k|    __m128i a1 = op[0];
  165|   826k|    __m128i c1 = op[1];
  166|   826k|    __m128i d1 = op[2];
  167|   826k|    __m128i b1 = op[3];
  168|   826k|    a1 = _mm_add_epi32(a1, c1);          // a1 += c1
  169|   826k|    d1 = _mm_sub_epi32(d1, b1);          // d1 -= b1
  170|   826k|    __m128i e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
  171|   826k|    e1 = _mm_srai_epi32(e1, 1);
  172|   826k|    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
  173|   826k|    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
  174|   826k|    a1 = _mm_sub_epi32(a1, b1);  // a1 -= b1
  175|   826k|    d1 = _mm_add_epi32(d1, c1);  // d1 += c1
  176|       |
  177|   826k|    op[0] = a1;
  178|   826k|    op[1] = b1;
  179|   826k|    op[2] = c1;
  180|   826k|    op[3] = d1;
  181|   826k|    if (i == 0) {
  ------------------
  |  Branch (181:9): [True: 413k, False: 413k]
  ------------------
  182|   413k|      transpose_32bit_4x4(op, op);
  183|   413k|    }
  184|   826k|  }
  185|       |
  186|       |  // Convert to int16_t. The C code checks that we are in range.
  187|   413k|  op[0] = _mm_packs_epi32(op[0], op[1]);
  188|   413k|  op[1] = _mm_packs_epi32(op[2], op[3]);
  189|       |
  190|       |  // Load uint16_t.
  191|   413k|  __m128i dst[2];
  192|   413k|  __m128i tmp[4];
  193|   413k|  tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  194|   413k|  tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  195|   413k|  dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
  196|   413k|  tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  197|   413k|  tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  198|   413k|  dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);
  199|       |
  200|       |  // Add to the previous results.
  201|   413k|  dst[0] = _mm_add_epi16(dst[0], op[0]);
  202|   413k|  dst[1] = _mm_add_epi16(dst[1], op[1]);
  203|       |
  204|       |  // Clamp.
  205|   413k|  dst[0] = highbd_clamp_epi16(dst[0], bd);
  206|   413k|  dst[1] = highbd_clamp_epi16(dst[1], bd);
  207|       |
  208|       |  // Store.
  209|   413k|  _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
  210|   413k|  dst[0] = _mm_srli_si128(dst[0], 8);
  211|   413k|  _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
  212|   413k|  _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
  213|   413k|  dst[1] = _mm_srli_si128(dst[1], 8);
  214|   413k|  _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
  215|   413k|}
av1_inv_txfm2d_add_4x4_sse4_1:
  722|  1.32M|                                   int stride, TX_TYPE tx_type, int bd) {
  723|  1.32M|  __m128i in[4];
  724|  1.32M|  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
  725|       |
  726|  1.32M|  switch (tx_type) {
  727|   385k|    case DCT_DCT:
  ------------------
  |  Branch (727:5): [True: 385k, False: 943k]
  ------------------
  728|   385k|      load_buffer_4x4(input, in);
  729|   385k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|   385k|#define INV_COS_BIT 12
  ------------------
  730|   385k|      transpose_32bit_4x4(in, in);
  731|   385k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|   385k|#define INV_COS_BIT 12
  ------------------
  732|   385k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  733|   385k|      break;
  734|   113k|    case ADST_DCT:
  ------------------
  |  Branch (734:5): [True: 113k, False: 1.21M]
  ------------------
  735|   113k|      load_buffer_4x4(input, in);
  736|   113k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|   113k|#define INV_COS_BIT 12
  ------------------
  737|   113k|      transpose_32bit_4x4(in, in);
  738|   113k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|   113k|#define INV_COS_BIT 12
  ------------------
  739|   113k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  740|   113k|      break;
  741|   168k|    case DCT_ADST:
  ------------------
  |  Branch (741:5): [True: 168k, False: 1.16M]
  ------------------
  742|   168k|      load_buffer_4x4(input, in);
  743|   168k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|   168k|#define INV_COS_BIT 12
  ------------------
  744|   168k|      transpose_32bit_4x4(in, in);
  745|   168k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|   168k|#define INV_COS_BIT 12
  ------------------
  746|   168k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  747|   168k|      break;
  748|   173k|    case ADST_ADST:
  ------------------
  |  Branch (748:5): [True: 173k, False: 1.15M]
  ------------------
  749|   173k|      load_buffer_4x4(input, in);
  750|   173k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|   173k|#define INV_COS_BIT 12
  ------------------
  751|   173k|      transpose_32bit_4x4(in, in);
  752|   173k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|   173k|#define INV_COS_BIT 12
  ------------------
  753|   173k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  754|   173k|      break;
  755|  6.28k|    case FLIPADST_DCT:
  ------------------
  |  Branch (755:5): [True: 6.28k, False: 1.32M]
  ------------------
  756|  6.28k|      load_buffer_4x4(input, in);
  757|  6.28k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  6.28k|#define INV_COS_BIT 12
  ------------------
  758|  6.28k|      transpose_32bit_4x4(in, in);
  759|  6.28k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  6.28k|#define INV_COS_BIT 12
  ------------------
  760|  6.28k|      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
  761|  6.28k|      break;
  762|  24.4k|    case DCT_FLIPADST:
  ------------------
  |  Branch (762:5): [True: 24.4k, False: 1.30M]
  ------------------
  763|  24.4k|      load_buffer_4x4(input, in);
  764|  24.4k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  24.4k|#define INV_COS_BIT 12
  ------------------
  765|  24.4k|      transpose_32bit_4x4(in, in);
  766|  24.4k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  24.4k|#define INV_COS_BIT 12
  ------------------
  767|  24.4k|      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
  768|  24.4k|      break;
  769|  18.8k|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (769:5): [True: 18.8k, False: 1.31M]
  ------------------
  770|  18.8k|      load_buffer_4x4(input, in);
  771|  18.8k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  18.8k|#define INV_COS_BIT 12
  ------------------
  772|  18.8k|      transpose_32bit_4x4(in, in);
  773|  18.8k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  18.8k|#define INV_COS_BIT 12
  ------------------
  774|  18.8k|      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
  775|  18.8k|      break;
  776|  8.73k|    case ADST_FLIPADST:
  ------------------
  |  Branch (776:5): [True: 8.73k, False: 1.32M]
  ------------------
  777|  8.73k|      load_buffer_4x4(input, in);
  778|  8.73k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  8.73k|#define INV_COS_BIT 12
  ------------------
  779|  8.73k|      transpose_32bit_4x4(in, in);
  780|  8.73k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  8.73k|#define INV_COS_BIT 12
  ------------------
  781|  8.73k|      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
  782|  8.73k|      break;
  783|  25.2k|    case FLIPADST_ADST:
  ------------------
  |  Branch (783:5): [True: 25.2k, False: 1.30M]
  ------------------
  784|  25.2k|      load_buffer_4x4(input, in);
  785|  25.2k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  25.2k|#define INV_COS_BIT 12
  ------------------
  786|  25.2k|      transpose_32bit_4x4(in, in);
  787|  25.2k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  25.2k|#define INV_COS_BIT 12
  ------------------
  788|  25.2k|      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
  789|  25.2k|      break;
  790|   318k|    case IDTX:
  ------------------
  |  Branch (790:5): [True: 318k, False: 1.01M]
  ------------------
  791|   318k|      load_buffer_4x4(input, in);
  792|   318k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|   318k|#define INV_COS_BIT 12
  ------------------
  793|   318k|      transpose_32bit_4x4(in, in);
  794|   318k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|   318k|#define INV_COS_BIT 12
  ------------------
  795|   318k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  796|   318k|      break;
  797|  7.87k|    case V_DCT:
  ------------------
  |  Branch (797:5): [True: 7.87k, False: 1.32M]
  ------------------
  798|  7.87k|      load_buffer_4x4(input, in);
  799|  7.87k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  7.87k|#define INV_COS_BIT 12
  ------------------
  800|  7.87k|      transpose_32bit_4x4(in, in);
  801|  7.87k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  7.87k|#define INV_COS_BIT 12
  ------------------
  802|  7.87k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  803|  7.87k|      break;
  804|  25.4k|    case H_DCT:
  ------------------
  |  Branch (804:5): [True: 25.4k, False: 1.30M]
  ------------------
  805|  25.4k|      load_buffer_4x4(input, in);
  806|  25.4k|      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  25.4k|#define INV_COS_BIT 12
  ------------------
  807|  25.4k|      transpose_32bit_4x4(in, in);
  808|  25.4k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  25.4k|#define INV_COS_BIT 12
  ------------------
  809|  25.4k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  810|  25.4k|      break;
  811|  17.1k|    case V_ADST:
  ------------------
  |  Branch (811:5): [True: 17.1k, False: 1.31M]
  ------------------
  812|  17.1k|      load_buffer_4x4(input, in);
  813|  17.1k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  17.1k|#define INV_COS_BIT 12
  ------------------
  814|  17.1k|      transpose_32bit_4x4(in, in);
  815|  17.1k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  17.1k|#define INV_COS_BIT 12
  ------------------
  816|  17.1k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  817|  17.1k|      break;
  818|  7.37k|    case H_ADST:
  ------------------
  |  Branch (818:5): [True: 7.37k, False: 1.32M]
  ------------------
  819|  7.37k|      load_buffer_4x4(input, in);
  820|  7.37k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  7.37k|#define INV_COS_BIT 12
  ------------------
  821|  7.37k|      transpose_32bit_4x4(in, in);
  822|  7.37k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  7.37k|#define INV_COS_BIT 12
  ------------------
  823|  7.37k|      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
  824|  7.37k|      break;
  825|  9.26k|    case V_FLIPADST:
  ------------------
  |  Branch (825:5): [True: 9.26k, False: 1.31M]
  ------------------
  826|  9.26k|      load_buffer_4x4(input, in);
  827|  9.26k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  9.26k|#define INV_COS_BIT 12
  ------------------
  828|  9.26k|      transpose_32bit_4x4(in, in);
  829|  9.26k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  9.26k|#define INV_COS_BIT 12
  ------------------
  830|  9.26k|      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
  831|  9.26k|      break;
  832|  17.4k|    case H_FLIPADST:
  ------------------
  |  Branch (832:5): [True: 17.4k, False: 1.31M]
  ------------------
  833|  17.4k|      load_buffer_4x4(input, in);
  834|  17.4k|      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
  ------------------
  |  |   43|  17.4k|#define INV_COS_BIT 12
  ------------------
  835|  17.4k|      transpose_32bit_4x4(in, in);
  836|  17.4k|      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  17.4k|#define INV_COS_BIT 12
  ------------------
  837|  17.4k|      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
  838|  17.4k|      break;
  839|      0|    default: assert(0);
  ------------------
  |  Branch (839:5): [True: 0, False: 1.32M]
  ------------------
  840|  1.32M|  }
  841|  1.32M|}
av1_highbd_inv_txfm2d_add_universe_sse4_1:
 5720|   620k|                                               int eob, const int bd) {
 5721|   620k|  switch (tx_type) {
 5722|      0|    case DCT_DCT:
  ------------------
  |  Branch (5722:5): [True: 0, False: 620k]
  ------------------
 5723|      0|    case ADST_DCT:
  ------------------
  |  Branch (5723:5): [True: 0, False: 620k]
  ------------------
 5724|      0|    case DCT_ADST:
  ------------------
  |  Branch (5724:5): [True: 0, False: 620k]
  ------------------
 5725|      0|    case ADST_ADST:
  ------------------
  |  Branch (5725:5): [True: 0, False: 620k]
  ------------------
 5726|      0|    case FLIPADST_DCT:
  ------------------
  |  Branch (5726:5): [True: 0, False: 620k]
  ------------------
 5727|      0|    case DCT_FLIPADST:
  ------------------
  |  Branch (5727:5): [True: 0, False: 620k]
  ------------------
 5728|      0|    case FLIPADST_FLIPADST:
  ------------------
  |  Branch (5728:5): [True: 0, False: 620k]
  ------------------
 5729|      0|    case ADST_FLIPADST:
  ------------------
  |  Branch (5729:5): [True: 0, False: 620k]
  ------------------
 5730|      0|    case FLIPADST_ADST:
  ------------------
  |  Branch (5730:5): [True: 0, False: 620k]
  ------------------
 5731|      0|      highbd_inv_txfm2d_add_no_identity_sse41(
 5732|      0|          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
  ------------------
  |  |   75|      0|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5733|      0|          bd);
 5734|      0|      break;
 5735|  29.3k|    case V_DCT:
  ------------------
  |  Branch (5735:5): [True: 29.3k, False: 591k]
  ------------------
 5736|  53.4k|    case V_ADST:
  ------------------
  |  Branch (5736:5): [True: 24.1k, False: 596k]
  ------------------
 5737|  58.7k|    case V_FLIPADST:
  ------------------
  |  Branch (5737:5): [True: 5.33k, False: 615k]
  ------------------
 5738|  58.7k|      highbd_inv_txfm2d_add_h_identity_ssse41(
 5739|  58.7k|          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
  ------------------
  |  |   75|  58.7k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5740|  58.7k|          bd);
 5741|  58.7k|      break;
 5742|   144k|    case H_DCT:
  ------------------
  |  Branch (5742:5): [True: 144k, False: 476k]
  ------------------
 5743|   165k|    case H_ADST:
  ------------------
  |  Branch (5743:5): [True: 21.0k, False: 599k]
  ------------------
 5744|   177k|    case H_FLIPADST:
  ------------------
  |  Branch (5744:5): [True: 11.9k, False: 608k]
  ------------------
 5745|   177k|      highbd_inv_txfm2d_add_v_identity_ssse41(
 5746|   177k|          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
  ------------------
  |  |   75|   177k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5747|   177k|          bd);
 5748|   177k|      break;
 5749|   384k|    case IDTX:
  ------------------
  |  Branch (5749:5): [True: 384k, False: 235k]
  ------------------
 5750|   384k|      highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
  ------------------
  |  |   75|   384k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5751|   384k|                                        stride, tx_type, tx_size, eob, bd);
 5752|   384k|      break;
 5753|      0|    default: assert(0); break;
  ------------------
  |  Branch (5753:5): [True: 0, False: 620k]
  ------------------
 5754|   620k|  }
 5755|   620k|}
av1_highbd_inv_txfm_add_sse4_1:
 5802|  4.91M|                                    int stride, const TxfmParam *txfm_param) {
 5803|  4.91M|  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
 5804|  4.91M|  const TX_SIZE tx_size = txfm_param->tx_size;
 5805|  4.91M|  switch (tx_size) {
 5806|      0|    case TX_8X8:
  ------------------
  |  Branch (5806:5): [True: 0, False: 4.91M]
  ------------------
 5807|      0|      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
 5808|      0|      break;
 5809|   496k|    case TX_4X8:
  ------------------
  |  Branch (5809:5): [True: 496k, False: 4.41M]
  ------------------
 5810|   496k|      av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
 5811|   496k|      break;
 5812|   937k|    case TX_8X4:
  ------------------
  |  Branch (5812:5): [True: 937k, False: 3.97M]
  ------------------
 5813|   937k|      av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
 5814|   937k|      break;
 5815|  2.37M|    case TX_4X4:
  ------------------
  |  Branch (5815:5): [True: 2.37M, False: 2.53M]
  ------------------
 5816|  2.37M|      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
 5817|  2.37M|      break;
 5818|   675k|    case TX_16X4:
  ------------------
  |  Branch (5818:5): [True: 675k, False: 4.23M]
  ------------------
 5819|   675k|      av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
 5820|   675k|      break;
 5821|   429k|    case TX_4X16:
  ------------------
  |  Branch (5821:5): [True: 429k, False: 4.48M]
  ------------------
 5822|   429k|      av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
 5823|   429k|      break;
 5824|      0|    default:
  ------------------
  |  Branch (5824:5): [True: 0, False: 4.91M]
  ------------------
 5825|      0|      av1_highbd_inv_txfm2d_add_universe_sse4_1(
 5826|      0|          input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
 5827|      0|          txfm_param->bd);
 5828|      0|      break;
 5829|  4.91M|  }
 5830|  4.91M|}
highbd_inv_txfm_sse4.c:load_buffer_4x4:
  141|  1.74M|static inline void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  142|  1.74M|  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  143|  1.74M|  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  144|  1.74M|  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  145|  1.74M|  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
  146|  1.74M|}
highbd_inv_txfm_sse4.c:highbd_clamp_epi16:
   24|  14.3M|static inline __m128i highbd_clamp_epi16(__m128i u, int bd) {
   25|  14.3M|  const __m128i zero = _mm_setzero_si128();
   26|  14.3M|  const __m128i one = _mm_set1_epi16(1);
   27|  14.3M|  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
   28|  14.3M|  __m128i clamped, mask;
   29|       |
   30|  14.3M|  mask = _mm_cmpgt_epi16(u, max);
   31|  14.3M|  clamped = _mm_andnot_si128(mask, u);
   32|  14.3M|  mask = _mm_and_si128(mask, max);
   33|  14.3M|  clamped = _mm_or_si128(mask, clamped);
   34|  14.3M|  mask = _mm_cmpgt_epi16(clamped, zero);
   35|  14.3M|  clamped = _mm_and_si128(clamped, mask);
   36|       |
   37|  14.3M|  return clamped;
   38|  14.3M|}
highbd_inv_txfm_sse4.c:idct4x4_sse4_1:
  456|  4.55M|                           int bd, int out_shift) {
  457|  4.55M|  const int32_t *cospi = cospi_arr(bit);
  458|  4.55M|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  459|  4.55M|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  460|  4.55M|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  461|  4.55M|  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  462|  4.55M|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  463|  4.55M|  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  9.10M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 4.55M]
  |  |  |  Branch (35:31): [True: 2.87M, False: 1.67M]
  |  |  |  Branch (35:44): [True: 2.87M, False: 1.67M]
  |  |  ------------------
  ------------------
  464|  4.55M|  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  465|  4.55M|  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  466|  4.55M|  __m128i u0, u1, u2, u3;
  467|  4.55M|  __m128i v0, v1, v2, v3, x, y;
  468|       |
  469|       |  // Stage 0
  470|       |  // Stage 1
  471|       |  // Stage 2
  472|  4.55M|  u0 = in[0];
  473|  4.55M|  u1 = in[1];
  474|  4.55M|  u2 = in[2];
  475|  4.55M|  u3 = in[3];
  476|       |
  477|  4.55M|  x = _mm_mullo_epi32(u0, cospi32);
  478|  4.55M|  y = _mm_mullo_epi32(u2, cospi32);
  479|  4.55M|  v0 = _mm_add_epi32(x, y);
  480|  4.55M|  v0 = _mm_add_epi32(v0, rnding);
  481|  4.55M|  v0 = _mm_srai_epi32(v0, bit);
  482|       |
  483|  4.55M|  v1 = _mm_sub_epi32(x, y);
  484|  4.55M|  v1 = _mm_add_epi32(v1, rnding);
  485|  4.55M|  v1 = _mm_srai_epi32(v1, bit);
  486|       |
  487|  4.55M|  x = _mm_mullo_epi32(u1, cospi48);
  488|  4.55M|  y = _mm_mullo_epi32(u3, cospim16);
  489|  4.55M|  v2 = _mm_add_epi32(x, y);
  490|  4.55M|  v2 = _mm_add_epi32(v2, rnding);
  491|  4.55M|  v2 = _mm_srai_epi32(v2, bit);
  492|       |
  493|  4.55M|  x = _mm_mullo_epi32(u1, cospi16);
  494|  4.55M|  y = _mm_mullo_epi32(u3, cospi48);
  495|  4.55M|  v3 = _mm_add_epi32(x, y);
  496|  4.55M|  v3 = _mm_add_epi32(v3, rnding);
  497|  4.55M|  v3 = _mm_srai_epi32(v3, bit);
  498|       |
  499|       |  // Stage 3
  500|  4.55M|  addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
  501|  4.55M|  addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
  502|       |
  503|  4.55M|  if (!do_cols) {
  ------------------
  |  Branch (503:7): [True: 1.67M, False: 2.87M]
  ------------------
  504|  1.67M|    log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  1.67M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.67M]
  |  |  ------------------
  ------------------
  505|  1.67M|    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  506|  1.67M|    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  507|       |
  508|  1.67M|    shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift);
  509|  1.67M|    shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift);
  510|  1.67M|  }
  511|  4.55M|}
highbd_inv_txfm_sse4.c:addsub_sse4_1:
  219|  47.9M|                          const __m128i *clamp_hi) {
  220|  47.9M|  __m128i a0 = _mm_add_epi32(in0, in1);
  221|  47.9M|  __m128i a1 = _mm_sub_epi32(in0, in1);
  222|       |
  223|  47.9M|  a0 = _mm_max_epi32(a0, *clamp_lo);
  224|  47.9M|  a0 = _mm_min_epi32(a0, *clamp_hi);
  225|  47.9M|  a1 = _mm_max_epi32(a1, *clamp_lo);
  226|  47.9M|  a1 = _mm_min_epi32(a1, *clamp_hi);
  227|       |
  228|  47.9M|  *out0 = a0;
  229|  47.9M|  *out1 = a1;
  230|  47.9M|}
highbd_inv_txfm_sse4.c:shift_and_clamp_sse4_1:
  234|  3.34M|                                   const __m128i *clamp_hi, int shift) {
  235|  3.34M|  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  236|  3.34M|  __m128i in0_w_offset = _mm_add_epi32(*in0, offset);
  237|  3.34M|  __m128i in1_w_offset = _mm_add_epi32(*in1, offset);
  238|       |
  239|  3.34M|  in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift));
  240|  3.34M|  in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift));
  241|       |
  242|  3.34M|  in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo);
  243|  3.34M|  in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi);
  244|  3.34M|  in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo);
  245|  3.34M|  in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi);
  246|       |
  247|  3.34M|  *in0 = in0_w_offset;
  248|  3.34M|  *in1 = in1_w_offset;
  249|  3.34M|}
highbd_inv_txfm_sse4.c:write_buffer_4x4:
  634|  1.32M|                             int fliplr, int flipud, int shift, int bd) {
  635|  1.32M|  const __m128i zero = _mm_setzero_si128();
  636|  1.32M|  __m128i u0, u1, u2, u3;
  637|  1.32M|  __m128i v0, v1, v2, v3;
  638|       |
  639|  1.32M|  round_shift_4x4(in, shift);
  640|       |
  641|  1.32M|  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
  642|  1.32M|  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
  643|  1.32M|  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  644|  1.32M|  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
  645|       |
  646|  1.32M|  v0 = _mm_unpacklo_epi16(v0, zero);
  647|  1.32M|  v1 = _mm_unpacklo_epi16(v1, zero);
  648|  1.32M|  v2 = _mm_unpacklo_epi16(v2, zero);
  649|  1.32M|  v3 = _mm_unpacklo_epi16(v3, zero);
  650|       |
  651|  1.32M|  if (fliplr) {
  ------------------
  |  Branch (651:7): [True: 69.4k, False: 1.25M]
  ------------------
  652|  69.4k|    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
  653|  69.4k|    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
  654|  69.4k|    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
  655|  69.4k|    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
  656|  69.4k|  }
  657|       |
  658|  1.32M|  if (flipud) {
  ------------------
  |  Branch (658:7): [True: 59.6k, False: 1.26M]
  ------------------
  659|  59.6k|    u0 = _mm_add_epi32(in[3], v0);
  660|  59.6k|    u1 = _mm_add_epi32(in[2], v1);
  661|  59.6k|    u2 = _mm_add_epi32(in[1], v2);
  662|  59.6k|    u3 = _mm_add_epi32(in[0], v3);
  663|  1.26M|  } else {
  664|  1.26M|    u0 = _mm_add_epi32(in[0], v0);
  665|  1.26M|    u1 = _mm_add_epi32(in[1], v1);
  666|  1.26M|    u2 = _mm_add_epi32(in[2], v2);
  667|  1.26M|    u3 = _mm_add_epi32(in[3], v3);
  668|  1.26M|  }
  669|       |
  670|  1.32M|  v0 = _mm_packus_epi32(u0, u1);
  671|  1.32M|  v2 = _mm_packus_epi32(u2, u3);
  672|       |
  673|  1.32M|  u0 = highbd_clamp_epi16(v0, bd);
  674|  1.32M|  u2 = highbd_clamp_epi16(v2, bd);
  675|       |
  676|  1.32M|  v0 = _mm_unpacklo_epi64(u0, u0);
  677|  1.32M|  v1 = _mm_unpackhi_epi64(u0, u0);
  678|  1.32M|  v2 = _mm_unpacklo_epi64(u2, u2);
  679|  1.32M|  v3 = _mm_unpackhi_epi64(u2, u2);
  680|       |
  681|  1.32M|  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
  682|  1.32M|  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
  683|  1.32M|  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
  684|  1.32M|  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
  685|  1.32M|}
highbd_inv_txfm_sse4.c:round_shift_4x4:
   40|  11.0M|static inline void round_shift_4x4(__m128i *in, int shift) {
   41|  11.0M|  if (shift != 0) {
  ------------------
  |  Branch (41:7): [True: 8.56M, False: 2.49M]
  ------------------
   42|  8.56M|    __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
   43|  8.56M|    in[0] = _mm_add_epi32(in[0], rnding);
   44|  8.56M|    in[1] = _mm_add_epi32(in[1], rnding);
   45|  8.56M|    in[2] = _mm_add_epi32(in[2], rnding);
   46|  8.56M|    in[3] = _mm_add_epi32(in[3], rnding);
   47|       |
   48|  8.56M|    in[0] = _mm_srai_epi32(in[0], shift);
   49|  8.56M|    in[1] = _mm_srai_epi32(in[1], shift);
   50|  8.56M|    in[2] = _mm_srai_epi32(in[2], shift);
   51|  8.56M|    in[3] = _mm_srai_epi32(in[3], shift);
   52|  8.56M|  }
   53|  11.0M|}
highbd_inv_txfm_sse4.c:iadst4x4_sse4_1:
  514|  3.23M|                            int bd, int out_shift) {
  515|  3.23M|  const int32_t *sinpi = sinpi_arr(bit);
  516|  3.23M|  const __m128i zero = _mm_setzero_si128();
  517|  3.23M|  __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
  518|  3.23M|  rnding = _mm_unpacklo_epi32(rnding, zero);
  519|  3.23M|  const __m128i mul = _mm_set1_epi32(1 << 4);
  520|  3.23M|  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  521|  3.23M|  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  522|  3.23M|  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  523|  3.23M|  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  524|  3.23M|  __m128i t;
  525|  3.23M|  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  526|  3.23M|  __m128i x0, x1, x2, x3;
  527|  3.23M|  __m128i u0, u1, u2, u3;
  528|  3.23M|  __m128i u0_low, u1_low, u2_low, u3_low;
  529|  3.23M|  __m128i u0_high, u1_high, u2_high, u3_high;
  530|       |
  531|  3.23M|  x0 = in[0];
  532|  3.23M|  x1 = in[1];
  533|  3.23M|  x2 = in[2];
  534|  3.23M|  x3 = in[3];
  535|       |
  536|  3.23M|  s0 = _mm_mullo_epi32(x0, sinpi1);
  537|  3.23M|  s1 = _mm_mullo_epi32(x0, sinpi2);
  538|  3.23M|  s2 = _mm_mullo_epi32(x1, sinpi3);
  539|  3.23M|  s3 = _mm_mullo_epi32(x2, sinpi4);
  540|  3.23M|  s4 = _mm_mullo_epi32(x2, sinpi1);
  541|  3.23M|  s5 = _mm_mullo_epi32(x3, sinpi2);
  542|  3.23M|  s6 = _mm_mullo_epi32(x3, sinpi4);
  543|  3.23M|  t = _mm_sub_epi32(x0, x2);
  544|  3.23M|  s7 = _mm_add_epi32(t, x3);
  545|       |
  546|  3.23M|  t = _mm_add_epi32(s0, s3);
  547|  3.23M|  s0 = _mm_add_epi32(t, s5);
  548|  3.23M|  t = _mm_sub_epi32(s1, s4);
  549|  3.23M|  s1 = _mm_sub_epi32(t, s6);
  550|  3.23M|  s3 = s2;
  551|  3.23M|  s2 = _mm_mullo_epi32(s7, sinpi3);
  552|       |
  553|  3.23M|  u0 = _mm_add_epi32(s0, s3);
  554|  3.23M|  u1 = _mm_add_epi32(s1, s3);
  555|  3.23M|  u2 = s2;
  556|  3.23M|  t = _mm_add_epi32(s0, s1);
  557|  3.23M|  u3 = _mm_sub_epi32(t, s3);
  558|       |
  559|       |  // u0
  560|  3.23M|  u0_low = _mm_mul_epi32(u0, mul);
  561|  3.23M|  u0_low = _mm_add_epi64(u0_low, rnding);
  562|       |
  563|  3.23M|  u0 = _mm_srli_si128(u0, 4);
  564|  3.23M|  u0_high = _mm_mul_epi32(u0, mul);
  565|  3.23M|  u0_high = _mm_add_epi64(u0_high, rnding);
  566|       |
  567|  3.23M|  u0_low = _mm_srli_si128(u0_low, 2);
  568|  3.23M|  u0_high = _mm_srli_si128(u0_high, 2);
  569|       |
  570|  3.23M|  u0 = _mm_unpacklo_epi32(u0_low, u0_high);
  571|  3.23M|  u0_high = _mm_unpackhi_epi32(u0_low, u0_high);
  572|  3.23M|  u0 = _mm_unpacklo_epi64(u0, u0_high);
  573|       |
  574|       |  // u1
  575|  3.23M|  u1_low = _mm_mul_epi32(u1, mul);
  576|  3.23M|  u1_low = _mm_add_epi64(u1_low, rnding);
  577|       |
  578|  3.23M|  u1 = _mm_srli_si128(u1, 4);
  579|  3.23M|  u1_high = _mm_mul_epi32(u1, mul);
  580|  3.23M|  u1_high = _mm_add_epi64(u1_high, rnding);
  581|       |
  582|  3.23M|  u1_low = _mm_srli_si128(u1_low, 2);
  583|  3.23M|  u1_high = _mm_srli_si128(u1_high, 2);
  584|       |
  585|  3.23M|  u1 = _mm_unpacklo_epi32(u1_low, u1_high);
  586|  3.23M|  u1_high = _mm_unpackhi_epi32(u1_low, u1_high);
  587|  3.23M|  u1 = _mm_unpacklo_epi64(u1, u1_high);
  588|       |
  589|       |  // u2
  590|  3.23M|  u2_low = _mm_mul_epi32(u2, mul);
  591|  3.23M|  u2_low = _mm_add_epi64(u2_low, rnding);
  592|       |
  593|  3.23M|  u2 = _mm_srli_si128(u2, 4);
  594|  3.23M|  u2_high = _mm_mul_epi32(u2, mul);
  595|  3.23M|  u2_high = _mm_add_epi64(u2_high, rnding);
  596|       |
  597|  3.23M|  u2_low = _mm_srli_si128(u2_low, 2);
  598|  3.23M|  u2_high = _mm_srli_si128(u2_high, 2);
  599|       |
  600|  3.23M|  u2 = _mm_unpacklo_epi32(u2_low, u2_high);
  601|  3.23M|  u2_high = _mm_unpackhi_epi32(u2_low, u2_high);
  602|  3.23M|  u2 = _mm_unpacklo_epi64(u2, u2_high);
  603|       |
  604|       |  // u3
  605|  3.23M|  u3_low = _mm_mul_epi32(u3, mul);
  606|  3.23M|  u3_low = _mm_add_epi64(u3_low, rnding);
  607|       |
  608|  3.23M|  u3 = _mm_srli_si128(u3, 4);
  609|  3.23M|  u3_high = _mm_mul_epi32(u3, mul);
  610|  3.23M|  u3_high = _mm_add_epi64(u3_high, rnding);
  611|       |
  612|  3.23M|  u3_low = _mm_srli_si128(u3_low, 2);
  613|  3.23M|  u3_high = _mm_srli_si128(u3_high, 2);
  614|       |
  615|  3.23M|  u3 = _mm_unpacklo_epi32(u3_low, u3_high);
  616|  3.23M|  u3_high = _mm_unpackhi_epi32(u3_low, u3_high);
  617|  3.23M|  u3 = _mm_unpacklo_epi64(u3, u3_high);
  618|       |
  619|  3.23M|  out[0] = u0;
  620|  3.23M|  out[1] = u1;
  621|  3.23M|  out[2] = u2;
  622|  3.23M|  out[3] = u3;
  623|       |
  624|  3.23M|  if (!do_cols) {
  ------------------
  |  Branch (624:7): [True: 1.47M, False: 1.76M]
  ------------------
  625|  1.47M|    const int log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  1.47M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.47M]
  |  |  ------------------
  ------------------
  626|  1.47M|    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  627|  1.47M|    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  628|  1.47M|    round_shift_4x4(out, out_shift);
  629|  1.47M|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  630|  1.47M|  }
  631|  3.23M|}
highbd_inv_txfm_sse4.c:highbd_clamp_epi32_sse4_1:
   64|  4.86M|                                      const __m128i *clamp_hi, int size) {
   65|  4.86M|  __m128i a0, a1;
   66|  14.5M|  for (int i = 0; i < size; i += 4) {
  ------------------
  |  Branch (66:19): [True: 9.72M, False: 4.86M]
  ------------------
   67|  9.72M|    a0 = _mm_max_epi32(in[i], *clamp_lo);
   68|  9.72M|    out[i] = _mm_min_epi32(a0, *clamp_hi);
   69|       |
   70|  9.72M|    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
   71|  9.72M|    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
   72|       |
   73|  9.72M|    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
   74|  9.72M|    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
   75|       |
   76|  9.72M|    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
   77|  9.72M|    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
   78|  9.72M|  }
   79|  4.86M|}
highbd_inv_txfm_sse4.c:iidentity4_sse4_1:
  688|  2.15M|                              int bd, int out_shift) {
  689|  2.15M|  (void)bit;
  690|  2.15M|  __m128i zero = _mm_setzero_si128();
  691|  2.15M|  __m128i fact = _mm_set1_epi32(NewSqrt2);
  692|  2.15M|  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  ------------------
  |  |   41|  2.15M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  693|  2.15M|  __m128i a0_low, a1_low;
  694|  2.15M|  __m128i a0_high, a1_high;
  695|       |
  696|  2.15M|  offset = _mm_unpacklo_epi32(offset, zero);
  697|       |
  698|  10.7M|  for (int i = 0; i < 4; i++) {
  ------------------
  |  Branch (698:19): [True: 8.60M, False: 2.15M]
  ------------------
  699|  8.60M|    a0_low = _mm_mul_epi32(in[i], fact);
  700|  8.60M|    a0_low = _mm_add_epi32(a0_low, offset);
  701|  8.60M|    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
  ------------------
  |  |   41|  8.60M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  702|       |
  703|  8.60M|    a0_high = _mm_srli_si128(in[i], 4);
  704|  8.60M|    a0_high = _mm_mul_epi32(a0_high, fact);
  705|  8.60M|    a0_high = _mm_add_epi32(a0_high, offset);
  706|  8.60M|    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
  ------------------
  |  |   41|  8.60M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
  707|       |
  708|  8.60M|    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
  709|  8.60M|    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
  710|  8.60M|    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
  711|  8.60M|  }
  712|       |
  713|  2.15M|  if (!do_cols) {
  ------------------
  |  Branch (713:7): [True: 890k, False: 1.26M]
  ------------------
  714|   890k|    const int log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   890k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 890k]
  |  |  ------------------
  ------------------
  715|   890k|    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  716|   890k|    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  717|   890k|    round_shift_4x4(out, out_shift);
  718|   890k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  719|   890k|  }
  720|  2.15M|}
highbd_inv_txfm_sse4.c:round_shift_8x8:
   55|  1.11M|static void round_shift_8x8(__m128i *in, int shift) {
   56|  1.11M|  round_shift_4x4(&in[0], shift);
   57|  1.11M|  round_shift_4x4(&in[4], shift);
   58|  1.11M|  round_shift_4x4(&in[8], shift);
   59|  1.11M|  round_shift_4x4(&in[12], shift);
   60|  1.11M|}
highbd_inv_txfm_sse4.c:neg_shift_sse4_1:
  438|  3.91M|                             int shift) {
  439|  3.91M|  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  440|  3.91M|  __m128i a0 = _mm_add_epi32(offset, in0);
  441|  3.91M|  __m128i a1 = _mm_sub_epi32(offset, in1);
  442|       |
  443|  3.91M|  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  444|  3.91M|  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
  445|       |
  446|  3.91M|  a0 = _mm_max_epi32(a0, *clamp_lo);
  447|  3.91M|  a0 = _mm_min_epi32(a0, *clamp_hi);
  448|  3.91M|  a1 = _mm_max_epi32(a1, *clamp_lo);
  449|  3.91M|  a1 = _mm_min_epi32(a1, *clamp_hi);
  450|       |
  451|  3.91M|  *out0 = a0;
  452|  3.91M|  *out1 = a1;
  453|  3.91M|}
highbd_inv_txfm_sse4.c:idct8x8_low1_sse4_1:
 1471|  22.6k|                                int bd, int out_shift) {
 1472|  22.6k|  const int32_t *cospi = cospi_arr(bit);
 1473|  22.6k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1474|  22.6k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1475|  22.6k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  45.3k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 22.6k]
  |  |  |  Branch (35:31): [True: 3.42k, False: 19.2k]
  |  |  |  Branch (35:44): [True: 3.42k, False: 19.2k]
  |  |  ------------------
  ------------------
 1476|  22.6k|  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1477|  22.6k|  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1478|  22.6k|  __m128i x;
 1479|       |
 1480|       |  // stage 0
 1481|       |  // stage 1
 1482|       |  // stage 2
 1483|       |  // stage 3
 1484|  22.6k|  x = _mm_mullo_epi32(in[0], cospi32);
 1485|  22.6k|  x = _mm_add_epi32(x, rnding);
 1486|  22.6k|  x = _mm_srai_epi32(x, bit);
 1487|       |
 1488|       |  // stage 4
 1489|       |  // stage 5
 1490|  22.6k|  if (!do_cols) {
  ------------------
  |  Branch (1490:7): [True: 19.2k, False: 3.42k]
  ------------------
 1491|  19.2k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  19.2k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 19.2k]
  |  |  ------------------
  ------------------
 1492|  19.2k|    clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 1493|  19.2k|    clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 1494|       |
 1495|  19.2k|    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
 1496|  19.2k|    x = _mm_add_epi32(x, offset);
 1497|  19.2k|    x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
 1498|  19.2k|  }
 1499|       |
 1500|  22.6k|  x = _mm_max_epi32(x, clamp_lo);
 1501|  22.6k|  x = _mm_min_epi32(x, clamp_hi);
 1502|  22.6k|  out[0] = x;
 1503|  22.6k|  out[1] = x;
 1504|  22.6k|  out[2] = x;
 1505|  22.6k|  out[3] = x;
 1506|  22.6k|  out[4] = x;
 1507|  22.6k|  out[5] = x;
 1508|  22.6k|  out[6] = x;
 1509|  22.6k|  out[7] = x;
 1510|  22.6k|}
highbd_inv_txfm_sse4.c:idct8x8_new_sse4_1:
 1513|   808k|                               int bd, int out_shift) {
 1514|   808k|  const int32_t *cospi = cospi_arr(bit);
 1515|   808k|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 1516|   808k|  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
 1517|   808k|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 1518|   808k|  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
 1519|   808k|  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 1520|   808k|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 1521|   808k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1522|   808k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 1523|   808k|  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
 1524|   808k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 1525|   808k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1526|   808k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  1.61M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 808k]
  |  |  |  Branch (35:31): [True: 266k, False: 542k]
  |  |  |  Branch (35:44): [True: 266k, False: 542k]
  |  |  ------------------
  ------------------
 1527|   808k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1528|   808k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1529|   808k|  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
 1530|   808k|  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
 1531|   808k|  __m128i x, y;
 1532|       |
 1533|       |  // stage 0
 1534|       |  // stage 1
 1535|       |  // stage 2
 1536|   808k|  u0 = in[0];
 1537|   808k|  u1 = in[4];
 1538|   808k|  u2 = in[2];
 1539|   808k|  u3 = in[6];
 1540|       |
 1541|   808k|  x = _mm_mullo_epi32(in[1], cospi56);
 1542|   808k|  y = _mm_mullo_epi32(in[7], cospim8);
 1543|   808k|  u4 = _mm_add_epi32(x, y);
 1544|   808k|  u4 = _mm_add_epi32(u4, rnding);
 1545|   808k|  u4 = _mm_srai_epi32(u4, bit);
 1546|       |
 1547|   808k|  x = _mm_mullo_epi32(in[1], cospi8);
 1548|   808k|  y = _mm_mullo_epi32(in[7], cospi56);
 1549|   808k|  u7 = _mm_add_epi32(x, y);
 1550|   808k|  u7 = _mm_add_epi32(u7, rnding);
 1551|   808k|  u7 = _mm_srai_epi32(u7, bit);
 1552|       |
 1553|   808k|  x = _mm_mullo_epi32(in[5], cospi24);
 1554|   808k|  y = _mm_mullo_epi32(in[3], cospim40);
 1555|   808k|  u5 = _mm_add_epi32(x, y);
 1556|   808k|  u5 = _mm_add_epi32(u5, rnding);
 1557|   808k|  u5 = _mm_srai_epi32(u5, bit);
 1558|       |
 1559|   808k|  x = _mm_mullo_epi32(in[5], cospi40);
 1560|   808k|  y = _mm_mullo_epi32(in[3], cospi24);
 1561|   808k|  u6 = _mm_add_epi32(x, y);
 1562|   808k|  u6 = _mm_add_epi32(u6, rnding);
 1563|   808k|  u6 = _mm_srai_epi32(u6, bit);
 1564|       |
 1565|       |  // stage 3
 1566|   808k|  x = _mm_mullo_epi32(u0, cospi32);
 1567|   808k|  y = _mm_mullo_epi32(u1, cospi32);
 1568|   808k|  v0 = _mm_add_epi32(x, y);
 1569|   808k|  v0 = _mm_add_epi32(v0, rnding);
 1570|   808k|  v0 = _mm_srai_epi32(v0, bit);
 1571|       |
 1572|   808k|  v1 = _mm_sub_epi32(x, y);
 1573|   808k|  v1 = _mm_add_epi32(v1, rnding);
 1574|   808k|  v1 = _mm_srai_epi32(v1, bit);
 1575|       |
 1576|   808k|  x = _mm_mullo_epi32(u2, cospi48);
 1577|   808k|  y = _mm_mullo_epi32(u3, cospim16);
 1578|   808k|  v2 = _mm_add_epi32(x, y);
 1579|   808k|  v2 = _mm_add_epi32(v2, rnding);
 1580|   808k|  v2 = _mm_srai_epi32(v2, bit);
 1581|       |
 1582|   808k|  x = _mm_mullo_epi32(u2, cospi16);
 1583|   808k|  y = _mm_mullo_epi32(u3, cospi48);
 1584|   808k|  v3 = _mm_add_epi32(x, y);
 1585|   808k|  v3 = _mm_add_epi32(v3, rnding);
 1586|   808k|  v3 = _mm_srai_epi32(v3, bit);
 1587|       |
 1588|   808k|  addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
 1589|   808k|  addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
 1590|       |
 1591|       |  // stage 4
 1592|   808k|  addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
 1593|   808k|  addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
 1594|   808k|  u4 = v4;
 1595|   808k|  u7 = v7;
 1596|       |
 1597|   808k|  x = _mm_mullo_epi32(v5, cospi32);
 1598|   808k|  y = _mm_mullo_epi32(v6, cospi32);
 1599|   808k|  u6 = _mm_add_epi32(y, x);
 1600|   808k|  u6 = _mm_add_epi32(u6, rnding);
 1601|   808k|  u6 = _mm_srai_epi32(u6, bit);
 1602|       |
 1603|   808k|  u5 = _mm_sub_epi32(y, x);
 1604|   808k|  u5 = _mm_add_epi32(u5, rnding);
 1605|   808k|  u5 = _mm_srai_epi32(u5, bit);
 1606|       |
 1607|       |  // stage 5
 1608|   808k|  addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
 1609|   808k|  addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
 1610|   808k|  addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
 1611|   808k|  addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
 1612|       |
 1613|   808k|  if (!do_cols) {
  ------------------
  |  Branch (1613:7): [True: 542k, False: 266k]
  ------------------
 1614|   542k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   542k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 542k]
  |  |  ------------------
  ------------------
 1615|   542k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 1616|   542k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 1617|       |
 1618|   542k|    round_shift_4x4(out, out_shift);
 1619|   542k|    round_shift_4x4(out + 4, out_shift);
 1620|   542k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
 1621|   542k|  }
 1622|   808k|}
highbd_inv_txfm_sse4.c:iadst8x8_low1_sse4_1:
 1625|  12.1k|                                 int do_cols, int bd, int out_shift) {
 1626|  12.1k|  const int32_t *cospi = cospi_arr(bit);
 1627|  12.1k|  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 1628|  12.1k|  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 1629|  12.1k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 1630|  12.1k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 1631|  12.1k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1632|  12.1k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1633|  12.1k|  const __m128i kZero = _mm_setzero_si128();
 1634|  12.1k|  __m128i u[8], x;
 1635|       |
 1636|       |  // stage 0
 1637|       |  // stage 1
 1638|       |  // stage 2
 1639|       |
 1640|  12.1k|  x = _mm_mullo_epi32(in[0], cospi60);
 1641|  12.1k|  u[0] = _mm_add_epi32(x, rnding);
 1642|  12.1k|  u[0] = _mm_srai_epi32(u[0], bit);
 1643|       |
 1644|  12.1k|  x = _mm_mullo_epi32(in[0], cospi4);
 1645|  12.1k|  u[1] = _mm_sub_epi32(kZero, x);
 1646|  12.1k|  u[1] = _mm_add_epi32(u[1], rnding);
 1647|  12.1k|  u[1] = _mm_srai_epi32(u[1], bit);
 1648|       |
 1649|       |  // stage 3
 1650|       |  // stage 4
 1651|  12.1k|  __m128i temp1, temp2;
 1652|  12.1k|  temp1 = _mm_mullo_epi32(u[0], cospi16);
 1653|  12.1k|  x = _mm_mullo_epi32(u[1], cospi48);
 1654|  12.1k|  temp1 = _mm_add_epi32(temp1, x);
 1655|  12.1k|  temp1 = _mm_add_epi32(temp1, rnding);
 1656|  12.1k|  temp1 = _mm_srai_epi32(temp1, bit);
 1657|  12.1k|  u[4] = temp1;
 1658|       |
 1659|  12.1k|  temp2 = _mm_mullo_epi32(u[0], cospi48);
 1660|  12.1k|  x = _mm_mullo_epi32(u[1], cospi16);
 1661|  12.1k|  u[5] = _mm_sub_epi32(temp2, x);
 1662|  12.1k|  u[5] = _mm_add_epi32(u[5], rnding);
 1663|  12.1k|  u[5] = _mm_srai_epi32(u[5], bit);
 1664|       |
 1665|       |  // stage 5
 1666|       |  // stage 6
 1667|  12.1k|  temp1 = _mm_mullo_epi32(u[0], cospi32);
 1668|  12.1k|  x = _mm_mullo_epi32(u[1], cospi32);
 1669|  12.1k|  u[2] = _mm_add_epi32(temp1, x);
 1670|  12.1k|  u[2] = _mm_add_epi32(u[2], rnding);
 1671|  12.1k|  u[2] = _mm_srai_epi32(u[2], bit);
 1672|       |
 1673|  12.1k|  u[3] = _mm_sub_epi32(temp1, x);
 1674|  12.1k|  u[3] = _mm_add_epi32(u[3], rnding);
 1675|  12.1k|  u[3] = _mm_srai_epi32(u[3], bit);
 1676|       |
 1677|  12.1k|  temp1 = _mm_mullo_epi32(u[4], cospi32);
 1678|  12.1k|  x = _mm_mullo_epi32(u[5], cospi32);
 1679|  12.1k|  u[6] = _mm_add_epi32(temp1, x);
 1680|  12.1k|  u[6] = _mm_add_epi32(u[6], rnding);
 1681|  12.1k|  u[6] = _mm_srai_epi32(u[6], bit);
 1682|       |
 1683|  12.1k|  u[7] = _mm_sub_epi32(temp1, x);
 1684|  12.1k|  u[7] = _mm_add_epi32(u[7], rnding);
 1685|  12.1k|  u[7] = _mm_srai_epi32(u[7], bit);
 1686|       |
 1687|       |  // stage 7
 1688|  12.1k|  if (do_cols) {
  ------------------
  |  Branch (1688:7): [True: 8.30k, False: 3.83k]
  ------------------
 1689|  8.30k|    out[0] = u[0];
 1690|  8.30k|    out[1] = _mm_sub_epi32(kZero, u[4]);
 1691|  8.30k|    out[2] = u[6];
 1692|  8.30k|    out[3] = _mm_sub_epi32(kZero, u[2]);
 1693|  8.30k|    out[4] = u[3];
 1694|  8.30k|    out[5] = _mm_sub_epi32(kZero, u[7]);
 1695|  8.30k|    out[6] = u[5];
 1696|  8.30k|    out[7] = _mm_sub_epi32(kZero, u[1]);
 1697|  8.30k|  } else {
 1698|  3.83k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  3.83k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 3.83k]
  |  |  ------------------
  ------------------
 1699|  3.83k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 1700|  3.83k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 1701|       |
 1702|  3.83k|    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 1703|  3.83k|                     out_shift);
 1704|  3.83k|    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
 1705|  3.83k|                     out_shift);
 1706|  3.83k|    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
 1707|  3.83k|                     out_shift);
 1708|  3.83k|    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
 1709|  3.83k|                     out_shift);
 1710|  3.83k|  }
 1711|  12.1k|}
highbd_inv_txfm_sse4.c:iadst8x8_new_sse4_1:
 1714|   650k|                                int bd, int out_shift) {
 1715|   650k|  const int32_t *cospi = cospi_arr(bit);
 1716|   650k|  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 1717|   650k|  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 1718|   650k|  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
 1719|   650k|  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
 1720|   650k|  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
 1721|   650k|  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
 1722|   650k|  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
 1723|   650k|  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
 1724|   650k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 1725|   650k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 1726|   650k|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 1727|   650k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1728|   650k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1729|   650k|  const __m128i kZero = _mm_setzero_si128();
 1730|   650k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  1.30M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 650k]
  |  |  |  Branch (35:31): [True: 231k, False: 418k]
  |  |  |  Branch (35:44): [True: 231k, False: 418k]
  |  |  ------------------
  ------------------
 1731|   650k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1732|   650k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1733|   650k|  __m128i u[8], v[8], x;
 1734|       |
 1735|       |  // stage 0
 1736|       |  // stage 1
 1737|       |  // stage 2
 1738|       |
 1739|   650k|  u[0] = _mm_mullo_epi32(in[7], cospi4);
 1740|   650k|  x = _mm_mullo_epi32(in[0], cospi60);
 1741|   650k|  u[0] = _mm_add_epi32(u[0], x);
 1742|   650k|  u[0] = _mm_add_epi32(u[0], rnding);
 1743|   650k|  u[0] = _mm_srai_epi32(u[0], bit);
 1744|       |
 1745|   650k|  u[1] = _mm_mullo_epi32(in[7], cospi60);
 1746|   650k|  x = _mm_mullo_epi32(in[0], cospi4);
 1747|   650k|  u[1] = _mm_sub_epi32(u[1], x);
 1748|   650k|  u[1] = _mm_add_epi32(u[1], rnding);
 1749|   650k|  u[1] = _mm_srai_epi32(u[1], bit);
 1750|       |
 1751|       |  // (2)
 1752|   650k|  u[2] = _mm_mullo_epi32(in[5], cospi20);
 1753|   650k|  x = _mm_mullo_epi32(in[2], cospi44);
 1754|   650k|  u[2] = _mm_add_epi32(u[2], x);
 1755|   650k|  u[2] = _mm_add_epi32(u[2], rnding);
 1756|   650k|  u[2] = _mm_srai_epi32(u[2], bit);
 1757|       |
 1758|   650k|  u[3] = _mm_mullo_epi32(in[5], cospi44);
 1759|   650k|  x = _mm_mullo_epi32(in[2], cospi20);
 1760|   650k|  u[3] = _mm_sub_epi32(u[3], x);
 1761|   650k|  u[3] = _mm_add_epi32(u[3], rnding);
 1762|   650k|  u[3] = _mm_srai_epi32(u[3], bit);
 1763|       |
 1764|       |  // (3)
 1765|   650k|  u[4] = _mm_mullo_epi32(in[3], cospi36);
 1766|   650k|  x = _mm_mullo_epi32(in[4], cospi28);
 1767|   650k|  u[4] = _mm_add_epi32(u[4], x);
 1768|   650k|  u[4] = _mm_add_epi32(u[4], rnding);
 1769|   650k|  u[4] = _mm_srai_epi32(u[4], bit);
 1770|       |
 1771|   650k|  u[5] = _mm_mullo_epi32(in[3], cospi28);
 1772|   650k|  x = _mm_mullo_epi32(in[4], cospi36);
 1773|   650k|  u[5] = _mm_sub_epi32(u[5], x);
 1774|   650k|  u[5] = _mm_add_epi32(u[5], rnding);
 1775|   650k|  u[5] = _mm_srai_epi32(u[5], bit);
 1776|       |
 1777|       |  // (4)
 1778|   650k|  u[6] = _mm_mullo_epi32(in[1], cospi52);
 1779|   650k|  x = _mm_mullo_epi32(in[6], cospi12);
 1780|   650k|  u[6] = _mm_add_epi32(u[6], x);
 1781|   650k|  u[6] = _mm_add_epi32(u[6], rnding);
 1782|   650k|  u[6] = _mm_srai_epi32(u[6], bit);
 1783|       |
 1784|   650k|  u[7] = _mm_mullo_epi32(in[1], cospi12);
 1785|   650k|  x = _mm_mullo_epi32(in[6], cospi52);
 1786|   650k|  u[7] = _mm_sub_epi32(u[7], x);
 1787|   650k|  u[7] = _mm_add_epi32(u[7], rnding);
 1788|   650k|  u[7] = _mm_srai_epi32(u[7], bit);
 1789|       |
 1790|       |  // stage 3
 1791|   650k|  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
 1792|   650k|  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
 1793|   650k|  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
 1794|   650k|  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
 1795|       |
 1796|       |  // stage 4
 1797|   650k|  u[0] = v[0];
 1798|   650k|  u[1] = v[1];
 1799|   650k|  u[2] = v[2];
 1800|   650k|  u[3] = v[3];
 1801|       |
 1802|   650k|  u[4] = _mm_mullo_epi32(v[4], cospi16);
 1803|   650k|  x = _mm_mullo_epi32(v[5], cospi48);
 1804|   650k|  u[4] = _mm_add_epi32(u[4], x);
 1805|   650k|  u[4] = _mm_add_epi32(u[4], rnding);
 1806|   650k|  u[4] = _mm_srai_epi32(u[4], bit);
 1807|       |
 1808|   650k|  u[5] = _mm_mullo_epi32(v[4], cospi48);
 1809|   650k|  x = _mm_mullo_epi32(v[5], cospi16);
 1810|   650k|  u[5] = _mm_sub_epi32(u[5], x);
 1811|   650k|  u[5] = _mm_add_epi32(u[5], rnding);
 1812|   650k|  u[5] = _mm_srai_epi32(u[5], bit);
 1813|       |
 1814|   650k|  u[6] = _mm_mullo_epi32(v[6], cospim48);
 1815|   650k|  x = _mm_mullo_epi32(v[7], cospi16);
 1816|   650k|  u[6] = _mm_add_epi32(u[6], x);
 1817|   650k|  u[6] = _mm_add_epi32(u[6], rnding);
 1818|   650k|  u[6] = _mm_srai_epi32(u[6], bit);
 1819|       |
 1820|   650k|  u[7] = _mm_mullo_epi32(v[6], cospi16);
 1821|   650k|  x = _mm_mullo_epi32(v[7], cospim48);
 1822|   650k|  u[7] = _mm_sub_epi32(u[7], x);
 1823|   650k|  u[7] = _mm_add_epi32(u[7], rnding);
 1824|   650k|  u[7] = _mm_srai_epi32(u[7], bit);
 1825|       |
 1826|       |  // stage 5
 1827|   650k|  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
 1828|   650k|  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
 1829|   650k|  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
 1830|   650k|  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
 1831|       |
 1832|       |  // stage 6
 1833|   650k|  u[0] = v[0];
 1834|   650k|  u[1] = v[1];
 1835|   650k|  u[4] = v[4];
 1836|   650k|  u[5] = v[5];
 1837|       |
 1838|   650k|  v[0] = _mm_mullo_epi32(v[2], cospi32);
 1839|   650k|  x = _mm_mullo_epi32(v[3], cospi32);
 1840|   650k|  u[2] = _mm_add_epi32(v[0], x);
 1841|   650k|  u[2] = _mm_add_epi32(u[2], rnding);
 1842|   650k|  u[2] = _mm_srai_epi32(u[2], bit);
 1843|       |
 1844|   650k|  u[3] = _mm_sub_epi32(v[0], x);
 1845|   650k|  u[3] = _mm_add_epi32(u[3], rnding);
 1846|   650k|  u[3] = _mm_srai_epi32(u[3], bit);
 1847|       |
 1848|   650k|  v[0] = _mm_mullo_epi32(v[6], cospi32);
 1849|   650k|  x = _mm_mullo_epi32(v[7], cospi32);
 1850|   650k|  u[6] = _mm_add_epi32(v[0], x);
 1851|   650k|  u[6] = _mm_add_epi32(u[6], rnding);
 1852|   650k|  u[6] = _mm_srai_epi32(u[6], bit);
 1853|       |
 1854|   650k|  u[7] = _mm_sub_epi32(v[0], x);
 1855|   650k|  u[7] = _mm_add_epi32(u[7], rnding);
 1856|   650k|  u[7] = _mm_srai_epi32(u[7], bit);
 1857|       |
 1858|       |  // stage 7
 1859|   650k|  if (do_cols) {
  ------------------
  |  Branch (1859:7): [True: 231k, False: 418k]
  ------------------
 1860|   231k|    out[0] = u[0];
 1861|   231k|    out[1] = _mm_sub_epi32(kZero, u[4]);
 1862|   231k|    out[2] = u[6];
 1863|   231k|    out[3] = _mm_sub_epi32(kZero, u[2]);
 1864|   231k|    out[4] = u[3];
 1865|   231k|    out[5] = _mm_sub_epi32(kZero, u[7]);
 1866|   231k|    out[6] = u[5];
 1867|   231k|    out[7] = _mm_sub_epi32(kZero, u[1]);
 1868|   418k|  } else {
 1869|   418k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   418k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 418k]
  |  |  ------------------
  ------------------
 1870|   418k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 1871|   418k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 1872|       |
 1873|   418k|    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 1874|   418k|                     out_shift);
 1875|   418k|    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
 1876|   418k|                     out_shift);
 1877|   418k|    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
 1878|   418k|                     out_shift);
 1879|   418k|    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
 1880|   418k|                     out_shift);
 1881|   418k|  }
 1882|   650k|}
highbd_inv_txfm_sse4.c:iidentity8_sse4_1:
 1307|  2.10M|                              int bd, int out_shift) {
 1308|  2.10M|  (void)bit;
 1309|  2.10M|  out[0] = _mm_add_epi32(in[0], in[0]);
 1310|  2.10M|  out[1] = _mm_add_epi32(in[1], in[1]);
 1311|  2.10M|  out[2] = _mm_add_epi32(in[2], in[2]);
 1312|  2.10M|  out[3] = _mm_add_epi32(in[3], in[3]);
 1313|  2.10M|  out[4] = _mm_add_epi32(in[4], in[4]);
 1314|  2.10M|  out[5] = _mm_add_epi32(in[5], in[5]);
 1315|  2.10M|  out[6] = _mm_add_epi32(in[6], in[6]);
 1316|  2.10M|  out[7] = _mm_add_epi32(in[7], in[7]);
 1317|       |
 1318|  2.10M|  if (!do_cols) {
  ------------------
  |  Branch (1318:7): [True: 915k, False: 1.19M]
  ------------------
 1319|   915k|    const int log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   915k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 915k]
  |  |  ------------------
  ------------------
 1320|   915k|    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1321|   915k|    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1322|   915k|    round_shift_4x4(out, out_shift);
 1323|   915k|    round_shift_4x4(out + 4, out_shift);
 1324|   915k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8);
 1325|   915k|  }
 1326|  2.10M|}
highbd_inv_txfm_sse4.c:idct16x16_low1_sse4_1:
 1885|  14.6k|                                  int do_cols, int bd, int out_shift) {
 1886|  14.6k|  const int32_t *cospi = cospi_arr(bit);
 1887|  14.6k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1888|  14.6k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1889|  14.6k|  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  29.3k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 14.6k]
  |  |  |  Branch (35:31): [True: 3.02k, False: 11.6k]
  |  |  |  Branch (35:44): [True: 3.02k, False: 11.6k]
  |  |  ------------------
  ------------------
 1890|  14.6k|  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1891|  14.6k|  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1892|       |  // stage 0
 1893|       |  // stage 1
 1894|       |  // stage 2
 1895|       |  // stage 3
 1896|       |  // stage 4
 1897|  14.6k|  in[0] = _mm_mullo_epi32(in[0], cospi32);
 1898|  14.6k|  in[0] = _mm_add_epi32(in[0], rnding);
 1899|  14.6k|  in[0] = _mm_srai_epi32(in[0], bit);
 1900|       |
 1901|       |  // stage 5
 1902|       |  // stage 6
 1903|       |  // stage 7
 1904|  14.6k|  if (!do_cols) {
  ------------------
  |  Branch (1904:7): [True: 11.6k, False: 3.02k]
  ------------------
 1905|  11.6k|    log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  11.6k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 11.6k]
  |  |  ------------------
  ------------------
 1906|  11.6k|    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1907|  11.6k|    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1908|  11.6k|    if (out_shift != 0) {
  ------------------
  |  Branch (1908:9): [True: 11.6k, False: 0]
  ------------------
 1909|  11.6k|      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
 1910|  11.6k|      in[0] = _mm_add_epi32(in[0], offset);
 1911|  11.6k|      in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
 1912|  11.6k|    }
 1913|  11.6k|  }
 1914|       |
 1915|  14.6k|  in[0] = _mm_max_epi32(in[0], clamp_lo);
 1916|  14.6k|  in[0] = _mm_min_epi32(in[0], clamp_hi);
 1917|  14.6k|  out[0] = in[0];
 1918|  14.6k|  out[1] = in[0];
 1919|  14.6k|  out[2] = in[0];
 1920|  14.6k|  out[3] = in[0];
 1921|  14.6k|  out[4] = in[0];
 1922|  14.6k|  out[5] = in[0];
 1923|  14.6k|  out[6] = in[0];
 1924|  14.6k|  out[7] = in[0];
 1925|  14.6k|  out[8] = in[0];
 1926|  14.6k|  out[9] = in[0];
 1927|  14.6k|  out[10] = in[0];
 1928|  14.6k|  out[11] = in[0];
 1929|  14.6k|  out[12] = in[0];
 1930|  14.6k|  out[13] = in[0];
 1931|  14.6k|  out[14] = in[0];
 1932|  14.6k|  out[15] = in[0];
 1933|  14.6k|}
highbd_inv_txfm_sse4.c:idct16x16_low8_sse4_1:
 1936|  35.5k|                                  int do_cols, int bd, int out_shift) {
 1937|  35.5k|  const int32_t *cospi = cospi_arr(bit);
 1938|  35.5k|  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 1939|  35.5k|  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
 1940|  35.5k|  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
 1941|  35.5k|  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
 1942|  35.5k|  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
 1943|  35.5k|  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 1944|  35.5k|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 1945|  35.5k|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 1946|  35.5k|  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
 1947|  35.5k|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 1948|  35.5k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 1949|  35.5k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 1950|  35.5k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 1951|  35.5k|  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
 1952|  35.5k|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 1953|  35.5k|  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
 1954|  35.5k|  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
 1955|  35.5k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 1956|  35.5k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  71.1k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 35.5k]
  |  |  |  Branch (35:31): [True: 4.95k, False: 30.6k]
  |  |  |  Branch (35:44): [True: 4.95k, False: 30.6k]
  |  |  ------------------
  ------------------
 1957|  35.5k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 1958|  35.5k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 1959|  35.5k|  __m128i u[16], x, y;
 1960|       |  // stage 0
 1961|       |  // stage 1
 1962|  35.5k|  u[0] = in[0];
 1963|  35.5k|  u[2] = in[4];
 1964|  35.5k|  u[4] = in[2];
 1965|  35.5k|  u[6] = in[6];
 1966|  35.5k|  u[8] = in[1];
 1967|  35.5k|  u[10] = in[5];
 1968|  35.5k|  u[12] = in[3];
 1969|  35.5k|  u[14] = in[7];
 1970|       |
 1971|       |  // stage 2
 1972|  35.5k|  u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
 1973|  35.5k|  u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
 1974|       |
 1975|  35.5k|  u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
 1976|  35.5k|  u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
 1977|       |
 1978|  35.5k|  u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
 1979|  35.5k|  u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
 1980|       |
 1981|  35.5k|  u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
 1982|  35.5k|  u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
 1983|       |
 1984|       |  // stage 3
 1985|  35.5k|  u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
 1986|  35.5k|  u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
 1987|  35.5k|  u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
 1988|  35.5k|  u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
 1989|       |
 1990|  35.5k|  addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
 1991|  35.5k|  addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
 1992|  35.5k|  addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
 1993|  35.5k|  addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 1994|       |
 1995|       |  // stage 4
 1996|  35.5k|  x = _mm_mullo_epi32(u[0], cospi32);
 1997|  35.5k|  u[0] = _mm_add_epi32(x, rnding);
 1998|  35.5k|  u[0] = _mm_srai_epi32(u[0], bit);
 1999|  35.5k|  u[1] = u[0];
 2000|       |
 2001|  35.5k|  u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
 2002|  35.5k|  u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
 2003|       |
 2004|  35.5k|  addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
 2005|  35.5k|  addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
 2006|       |
 2007|  35.5k|  x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 2008|  35.5k|  u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 2009|  35.5k|  u[9] = x;
 2010|  35.5k|  y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 2011|  35.5k|  u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 2012|  35.5k|  u[10] = y;
 2013|       |
 2014|       |  // stage 5
 2015|  35.5k|  addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 2016|  35.5k|  addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 2017|       |
 2018|  35.5k|  x = _mm_mullo_epi32(u[5], cospi32);
 2019|  35.5k|  y = _mm_mullo_epi32(u[6], cospi32);
 2020|  35.5k|  u[5] = _mm_sub_epi32(y, x);
 2021|  35.5k|  u[5] = _mm_add_epi32(u[5], rnding);
 2022|  35.5k|  u[5] = _mm_srai_epi32(u[5], bit);
 2023|       |
 2024|  35.5k|  u[6] = _mm_add_epi32(y, x);
 2025|  35.5k|  u[6] = _mm_add_epi32(u[6], rnding);
 2026|  35.5k|  u[6] = _mm_srai_epi32(u[6], bit);
 2027|       |
 2028|  35.5k|  addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 2029|  35.5k|  addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 2030|  35.5k|  addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 2031|  35.5k|  addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 2032|       |
 2033|       |  // stage 6
 2034|  35.5k|  addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
 2035|  35.5k|  addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
 2036|  35.5k|  addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
 2037|  35.5k|  addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
 2038|       |
 2039|  35.5k|  x = _mm_mullo_epi32(u[10], cospi32);
 2040|  35.5k|  y = _mm_mullo_epi32(u[13], cospi32);
 2041|  35.5k|  u[10] = _mm_sub_epi32(y, x);
 2042|  35.5k|  u[10] = _mm_add_epi32(u[10], rnding);
 2043|  35.5k|  u[10] = _mm_srai_epi32(u[10], bit);
 2044|       |
 2045|  35.5k|  u[13] = _mm_add_epi32(x, y);
 2046|  35.5k|  u[13] = _mm_add_epi32(u[13], rnding);
 2047|  35.5k|  u[13] = _mm_srai_epi32(u[13], bit);
 2048|       |
 2049|  35.5k|  x = _mm_mullo_epi32(u[11], cospi32);
 2050|  35.5k|  y = _mm_mullo_epi32(u[12], cospi32);
 2051|  35.5k|  u[11] = _mm_sub_epi32(y, x);
 2052|  35.5k|  u[11] = _mm_add_epi32(u[11], rnding);
 2053|  35.5k|  u[11] = _mm_srai_epi32(u[11], bit);
 2054|       |
 2055|  35.5k|  u[12] = _mm_add_epi32(x, y);
 2056|  35.5k|  u[12] = _mm_add_epi32(u[12], rnding);
 2057|  35.5k|  u[12] = _mm_srai_epi32(u[12], bit);
 2058|       |  // stage 7
 2059|  35.5k|  addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
 2060|  35.5k|  addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
 2061|  35.5k|  addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
 2062|  35.5k|  addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
 2063|  35.5k|  addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
 2064|  35.5k|  addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
 2065|  35.5k|  addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
 2066|  35.5k|  addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 2067|       |
 2068|  35.5k|  if (!do_cols) {
  ------------------
  |  Branch (2068:7): [True: 30.6k, False: 4.95k]
  ------------------
 2069|  30.6k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  30.6k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 30.6k]
  |  |  ------------------
  ------------------
 2070|  30.6k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 2071|  30.6k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 2072|  30.6k|    round_shift_8x8(out, out_shift);
 2073|  30.6k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
 2074|  30.6k|  }
 2075|  35.5k|}
highbd_inv_txfm_sse4.c:idct16x16_sse4_1:
 2567|   668k|                             int bd, int out_shift) {
 2568|   668k|  const int32_t *cospi = cospi_arr(bit);
 2569|   668k|  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
 2570|   668k|  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
 2571|   668k|  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
 2572|   668k|  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
 2573|   668k|  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
 2574|   668k|  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
 2575|   668k|  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
 2576|   668k|  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
 2577|   668k|  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
 2578|   668k|  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
 2579|   668k|  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
 2580|   668k|  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
 2581|   668k|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 2582|   668k|  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
 2583|   668k|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 2584|   668k|  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
 2585|   668k|  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 2586|   668k|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 2587|   668k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 2588|   668k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 2589|   668k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 2590|   668k|  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
 2591|   668k|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 2592|   668k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 2593|   668k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  1.33M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 668k]
  |  |  |  Branch (35:31): [True: 202k, False: 465k]
  |  |  |  Branch (35:44): [True: 202k, False: 465k]
  |  |  ------------------
  ------------------
 2594|   668k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 2595|   668k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 2596|   668k|  __m128i u[16], v[16], x, y;
 2597|       |
 2598|   668k|  {
 2599|       |    // stage 0
 2600|       |    // stage 1
 2601|   668k|    u[0] = in[0];
 2602|   668k|    u[1] = in[8];
 2603|   668k|    u[2] = in[4];
 2604|   668k|    u[3] = in[12];
 2605|   668k|    u[4] = in[2];
 2606|   668k|    u[5] = in[10];
 2607|   668k|    u[6] = in[6];
 2608|   668k|    u[7] = in[14];
 2609|   668k|    u[8] = in[1];
 2610|   668k|    u[9] = in[9];
 2611|   668k|    u[10] = in[5];
 2612|   668k|    u[11] = in[13];
 2613|   668k|    u[12] = in[3];
 2614|   668k|    u[13] = in[11];
 2615|   668k|    u[14] = in[7];
 2616|   668k|    u[15] = in[15];
 2617|       |
 2618|       |    // stage 2
 2619|   668k|    v[0] = u[0];
 2620|   668k|    v[1] = u[1];
 2621|   668k|    v[2] = u[2];
 2622|   668k|    v[3] = u[3];
 2623|   668k|    v[4] = u[4];
 2624|   668k|    v[5] = u[5];
 2625|   668k|    v[6] = u[6];
 2626|   668k|    v[7] = u[7];
 2627|       |
 2628|   668k|    v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
 2629|   668k|    v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
 2630|   668k|    v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
 2631|   668k|    v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
 2632|   668k|    v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
 2633|   668k|    v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
 2634|   668k|    v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
 2635|   668k|    v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
 2636|       |
 2637|       |    // stage 3
 2638|   668k|    u[0] = v[0];
 2639|   668k|    u[1] = v[1];
 2640|   668k|    u[2] = v[2];
 2641|   668k|    u[3] = v[3];
 2642|   668k|    u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
 2643|   668k|    u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
 2644|   668k|    u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
 2645|   668k|    u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
 2646|   668k|    addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
 2647|   668k|    addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
 2648|   668k|    addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
 2649|   668k|    addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
 2650|       |
 2651|       |    // stage 4
 2652|   668k|    x = _mm_mullo_epi32(u[0], cospi32);
 2653|   668k|    y = _mm_mullo_epi32(u[1], cospi32);
 2654|   668k|    v[0] = _mm_add_epi32(x, y);
 2655|   668k|    v[0] = _mm_add_epi32(v[0], rnding);
 2656|   668k|    v[0] = _mm_srai_epi32(v[0], bit);
 2657|       |
 2658|   668k|    v[1] = _mm_sub_epi32(x, y);
 2659|   668k|    v[1] = _mm_add_epi32(v[1], rnding);
 2660|   668k|    v[1] = _mm_srai_epi32(v[1], bit);
 2661|       |
 2662|   668k|    v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
 2663|   668k|    v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
 2664|   668k|    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
 2665|   668k|    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
 2666|   668k|    v[8] = u[8];
 2667|   668k|    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
 2668|   668k|    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
 2669|   668k|    v[11] = u[11];
 2670|   668k|    v[12] = u[12];
 2671|   668k|    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
 2672|   668k|    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
 2673|   668k|    v[15] = u[15];
 2674|       |
 2675|       |    // stage 5
 2676|   668k|    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
 2677|   668k|    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
 2678|   668k|    u[4] = v[4];
 2679|       |
 2680|   668k|    x = _mm_mullo_epi32(v[5], cospi32);
 2681|   668k|    y = _mm_mullo_epi32(v[6], cospi32);
 2682|   668k|    u[5] = _mm_sub_epi32(y, x);
 2683|   668k|    u[5] = _mm_add_epi32(u[5], rnding);
 2684|   668k|    u[5] = _mm_srai_epi32(u[5], bit);
 2685|       |
 2686|   668k|    u[6] = _mm_add_epi32(y, x);
 2687|   668k|    u[6] = _mm_add_epi32(u[6], rnding);
 2688|   668k|    u[6] = _mm_srai_epi32(u[6], bit);
 2689|       |
 2690|   668k|    u[7] = v[7];
 2691|   668k|    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
 2692|   668k|    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
 2693|   668k|    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
 2694|   668k|    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
 2695|       |
 2696|       |    // stage 6
 2697|   668k|    addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
 2698|   668k|    addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
 2699|   668k|    addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
 2700|   668k|    addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
 2701|   668k|    v[8] = u[8];
 2702|   668k|    v[9] = u[9];
 2703|       |
 2704|   668k|    x = _mm_mullo_epi32(u[10], cospi32);
 2705|   668k|    y = _mm_mullo_epi32(u[13], cospi32);
 2706|   668k|    v[10] = _mm_sub_epi32(y, x);
 2707|   668k|    v[10] = _mm_add_epi32(v[10], rnding);
 2708|   668k|    v[10] = _mm_srai_epi32(v[10], bit);
 2709|       |
 2710|   668k|    v[13] = _mm_add_epi32(x, y);
 2711|   668k|    v[13] = _mm_add_epi32(v[13], rnding);
 2712|   668k|    v[13] = _mm_srai_epi32(v[13], bit);
 2713|       |
 2714|   668k|    x = _mm_mullo_epi32(u[11], cospi32);
 2715|   668k|    y = _mm_mullo_epi32(u[12], cospi32);
 2716|   668k|    v[11] = _mm_sub_epi32(y, x);
 2717|   668k|    v[11] = _mm_add_epi32(v[11], rnding);
 2718|   668k|    v[11] = _mm_srai_epi32(v[11], bit);
 2719|       |
 2720|   668k|    v[12] = _mm_add_epi32(x, y);
 2721|   668k|    v[12] = _mm_add_epi32(v[12], rnding);
 2722|   668k|    v[12] = _mm_srai_epi32(v[12], bit);
 2723|       |
 2724|   668k|    v[14] = u[14];
 2725|   668k|    v[15] = u[15];
 2726|       |
 2727|       |    // stage 7
 2728|   668k|    addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
 2729|   668k|    addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
 2730|   668k|    addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
 2731|   668k|    addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
 2732|   668k|    addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
 2733|   668k|    addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
 2734|   668k|    addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
 2735|   668k|    addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
 2736|       |
 2737|   668k|    if (!do_cols) {
  ------------------
  |  Branch (2737:9): [True: 465k, False: 202k]
  ------------------
 2738|   465k|      const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   465k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 465k]
  |  |  ------------------
  ------------------
 2739|   465k|      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 2740|   465k|      const __m128i clamp_hi_out =
 2741|   465k|          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 2742|   465k|      round_shift_8x8(out, out_shift);
 2743|   465k|      highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
 2744|   465k|    }
 2745|   668k|  }
 2746|   668k|}
highbd_inv_txfm_sse4.c:iadst16x16_low1_sse4_1:
 2078|  1.76k|                                   int do_cols, int bd, int out_shift) {
 2079|  1.76k|  const int32_t *cospi = cospi_arr(bit);
 2080|  1.76k|  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
 2081|  1.76k|  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
 2082|  1.76k|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 2083|  1.76k|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 2084|  1.76k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 2085|  1.76k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 2086|  1.76k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 2087|  1.76k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 2088|  1.76k|  const __m128i zero = _mm_setzero_si128();
 2089|  1.76k|  __m128i v[16], x, y, temp1, temp2;
 2090|       |  // stage 0
 2091|       |  // stage 1
 2092|       |  // stage 2
 2093|  1.76k|  x = _mm_mullo_epi32(in[0], cospi62);
 2094|  1.76k|  v[0] = _mm_add_epi32(x, rnding);
 2095|  1.76k|  v[0] = _mm_srai_epi32(v[0], bit);
 2096|       |
 2097|  1.76k|  x = _mm_mullo_epi32(in[0], cospi2);
 2098|  1.76k|  v[1] = _mm_sub_epi32(zero, x);
 2099|  1.76k|  v[1] = _mm_add_epi32(v[1], rnding);
 2100|  1.76k|  v[1] = _mm_srai_epi32(v[1], bit);
 2101|       |
 2102|       |  // stage 3
 2103|  1.76k|  v[8] = v[0];
 2104|  1.76k|  v[9] = v[1];
 2105|       |
 2106|       |  // stage 4
 2107|  1.76k|  temp1 = _mm_mullo_epi32(v[8], cospi8);
 2108|  1.76k|  x = _mm_mullo_epi32(v[9], cospi56);
 2109|  1.76k|  temp1 = _mm_add_epi32(temp1, x);
 2110|  1.76k|  temp1 = _mm_add_epi32(temp1, rnding);
 2111|  1.76k|  temp1 = _mm_srai_epi32(temp1, bit);
 2112|       |
 2113|  1.76k|  temp2 = _mm_mullo_epi32(v[8], cospi56);
 2114|  1.76k|  x = _mm_mullo_epi32(v[9], cospi8);
 2115|  1.76k|  temp2 = _mm_sub_epi32(temp2, x);
 2116|  1.76k|  temp2 = _mm_add_epi32(temp2, rnding);
 2117|  1.76k|  temp2 = _mm_srai_epi32(temp2, bit);
 2118|  1.76k|  v[8] = temp1;
 2119|  1.76k|  v[9] = temp2;
 2120|       |
 2121|       |  // stage 5
 2122|  1.76k|  v[4] = v[0];
 2123|  1.76k|  v[5] = v[1];
 2124|  1.76k|  v[12] = v[8];
 2125|  1.76k|  v[13] = v[9];
 2126|       |
 2127|       |  // stage 6
 2128|  1.76k|  temp1 = _mm_mullo_epi32(v[4], cospi16);
 2129|  1.76k|  x = _mm_mullo_epi32(v[5], cospi48);
 2130|  1.76k|  temp1 = _mm_add_epi32(temp1, x);
 2131|  1.76k|  temp1 = _mm_add_epi32(temp1, rnding);
 2132|  1.76k|  temp1 = _mm_srai_epi32(temp1, bit);
 2133|       |
 2134|  1.76k|  temp2 = _mm_mullo_epi32(v[4], cospi48);
 2135|  1.76k|  x = _mm_mullo_epi32(v[5], cospi16);
 2136|  1.76k|  temp2 = _mm_sub_epi32(temp2, x);
 2137|  1.76k|  temp2 = _mm_add_epi32(temp2, rnding);
 2138|  1.76k|  temp2 = _mm_srai_epi32(temp2, bit);
 2139|  1.76k|  v[4] = temp1;
 2140|  1.76k|  v[5] = temp2;
 2141|       |
 2142|  1.76k|  temp1 = _mm_mullo_epi32(v[12], cospi16);
 2143|  1.76k|  x = _mm_mullo_epi32(v[13], cospi48);
 2144|  1.76k|  temp1 = _mm_add_epi32(temp1, x);
 2145|  1.76k|  temp1 = _mm_add_epi32(temp1, rnding);
 2146|  1.76k|  temp1 = _mm_srai_epi32(temp1, bit);
 2147|       |
 2148|  1.76k|  temp2 = _mm_mullo_epi32(v[12], cospi48);
 2149|  1.76k|  x = _mm_mullo_epi32(v[13], cospi16);
 2150|  1.76k|  temp2 = _mm_sub_epi32(temp2, x);
 2151|  1.76k|  temp2 = _mm_add_epi32(temp2, rnding);
 2152|  1.76k|  temp2 = _mm_srai_epi32(temp2, bit);
 2153|  1.76k|  v[12] = temp1;
 2154|  1.76k|  v[13] = temp2;
 2155|       |
 2156|       |  // stage 7
 2157|  1.76k|  v[2] = v[0];
 2158|  1.76k|  v[3] = v[1];
 2159|  1.76k|  v[6] = v[4];
 2160|  1.76k|  v[7] = v[5];
 2161|  1.76k|  v[10] = v[8];
 2162|  1.76k|  v[11] = v[9];
 2163|  1.76k|  v[14] = v[12];
 2164|  1.76k|  v[15] = v[13];
 2165|       |
 2166|       |  // stage 8
 2167|  1.76k|  y = _mm_mullo_epi32(v[2], cospi32);
 2168|  1.76k|  x = _mm_mullo_epi32(v[3], cospi32);
 2169|  1.76k|  v[2] = _mm_add_epi32(y, x);
 2170|  1.76k|  v[2] = _mm_add_epi32(v[2], rnding);
 2171|  1.76k|  v[2] = _mm_srai_epi32(v[2], bit);
 2172|       |
 2173|  1.76k|  v[3] = _mm_sub_epi32(y, x);
 2174|  1.76k|  v[3] = _mm_add_epi32(v[3], rnding);
 2175|  1.76k|  v[3] = _mm_srai_epi32(v[3], bit);
 2176|       |
 2177|  1.76k|  y = _mm_mullo_epi32(v[6], cospi32);
 2178|  1.76k|  x = _mm_mullo_epi32(v[7], cospi32);
 2179|  1.76k|  v[6] = _mm_add_epi32(y, x);
 2180|  1.76k|  v[6] = _mm_add_epi32(v[6], rnding);
 2181|  1.76k|  v[6] = _mm_srai_epi32(v[6], bit);
 2182|       |
 2183|  1.76k|  v[7] = _mm_sub_epi32(y, x);
 2184|  1.76k|  v[7] = _mm_add_epi32(v[7], rnding);
 2185|  1.76k|  v[7] = _mm_srai_epi32(v[7], bit);
 2186|       |
 2187|  1.76k|  y = _mm_mullo_epi32(v[10], cospi32);
 2188|  1.76k|  x = _mm_mullo_epi32(v[11], cospi32);
 2189|  1.76k|  v[10] = _mm_add_epi32(y, x);
 2190|  1.76k|  v[10] = _mm_add_epi32(v[10], rnding);
 2191|  1.76k|  v[10] = _mm_srai_epi32(v[10], bit);
 2192|       |
 2193|  1.76k|  v[11] = _mm_sub_epi32(y, x);
 2194|  1.76k|  v[11] = _mm_add_epi32(v[11], rnding);
 2195|  1.76k|  v[11] = _mm_srai_epi32(v[11], bit);
 2196|       |
 2197|  1.76k|  y = _mm_mullo_epi32(v[14], cospi32);
 2198|  1.76k|  x = _mm_mullo_epi32(v[15], cospi32);
 2199|  1.76k|  v[14] = _mm_add_epi32(y, x);
 2200|  1.76k|  v[14] = _mm_add_epi32(v[14], rnding);
 2201|  1.76k|  v[14] = _mm_srai_epi32(v[14], bit);
 2202|       |
 2203|  1.76k|  v[15] = _mm_sub_epi32(y, x);
 2204|  1.76k|  v[15] = _mm_add_epi32(v[15], rnding);
 2205|  1.76k|  v[15] = _mm_srai_epi32(v[15], bit);
 2206|       |
 2207|       |  // stage 9
 2208|  1.76k|  if (do_cols) {
  ------------------
  |  Branch (2208:7): [True: 622, False: 1.14k]
  ------------------
 2209|    622|    out[0] = v[0];
 2210|    622|    out[1] = _mm_sub_epi32(zero, v[8]);
 2211|    622|    out[2] = v[12];
 2212|    622|    out[3] = _mm_sub_epi32(zero, v[4]);
 2213|    622|    out[4] = v[6];
 2214|    622|    out[5] = _mm_sub_epi32(zero, v[14]);
 2215|    622|    out[6] = v[10];
 2216|    622|    out[7] = _mm_sub_epi32(zero, v[2]);
 2217|    622|    out[8] = v[3];
 2218|    622|    out[9] = _mm_sub_epi32(zero, v[11]);
 2219|    622|    out[10] = v[15];
 2220|    622|    out[11] = _mm_sub_epi32(zero, v[7]);
 2221|    622|    out[12] = v[5];
 2222|    622|    out[13] = _mm_sub_epi32(zero, v[13]);
 2223|    622|    out[14] = v[9];
 2224|    622|    out[15] = _mm_sub_epi32(zero, v[1]);
 2225|  1.14k|  } else {
 2226|  1.14k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  1.14k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 1.14k]
  |  |  ------------------
  ------------------
 2227|  1.14k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 2228|  1.14k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 2229|       |
 2230|  1.14k|    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2231|  1.14k|                     out_shift);
 2232|  1.14k|    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
 2233|  1.14k|                     &clamp_hi_out, out_shift);
 2234|  1.14k|    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
 2235|  1.14k|                     &clamp_hi_out, out_shift);
 2236|  1.14k|    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
 2237|  1.14k|                     &clamp_hi_out, out_shift);
 2238|  1.14k|    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
 2239|  1.14k|                     &clamp_hi_out, out_shift);
 2240|  1.14k|    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
 2241|  1.14k|                     &clamp_hi_out, out_shift);
 2242|  1.14k|    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
 2243|  1.14k|                     &clamp_hi_out, out_shift);
 2244|  1.14k|    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
 2245|  1.14k|                     &clamp_hi_out, out_shift);
 2246|  1.14k|  }
 2247|  1.76k|}
highbd_inv_txfm_sse4.c:iadst16x16_low8_sse4_1:
 2250|  7.89k|                                   int do_cols, int bd, int out_shift) {
 2251|  7.89k|  const int32_t *cospi = cospi_arr(bit);
 2252|  7.89k|  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
 2253|  7.89k|  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
 2254|  7.89k|  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
 2255|  7.89k|  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
 2256|  7.89k|  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
 2257|  7.89k|  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
 2258|  7.89k|  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
 2259|  7.89k|  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
 2260|  7.89k|  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
 2261|  7.89k|  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
 2262|  7.89k|  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
 2263|  7.89k|  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
 2264|  7.89k|  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
 2265|  7.89k|  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
 2266|  7.89k|  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
 2267|  7.89k|  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
 2268|  7.89k|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 2269|  7.89k|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 2270|  7.89k|  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 2271|  7.89k|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 2272|  7.89k|  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
 2273|  7.89k|  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
 2274|  7.89k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 2275|  7.89k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 2276|  7.89k|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 2277|  7.89k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 2278|  7.89k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 2279|  7.89k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|  15.7k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 7.89k]
  |  |  |  Branch (35:31): [True: 1.62k, False: 6.27k]
  |  |  |  Branch (35:44): [True: 1.62k, False: 6.27k]
  |  |  ------------------
  ------------------
 2280|  7.89k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 2281|  7.89k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 2282|  7.89k|  __m128i zero = _mm_setzero_si128();
 2283|  7.89k|  __m128i u[16], x, y;
 2284|       |
 2285|       |  // stage 0
 2286|       |  // stage 1
 2287|       |  // stage 2
 2288|  7.89k|  x = _mm_mullo_epi32(in[0], cospi62);
 2289|  7.89k|  u[0] = _mm_add_epi32(x, rnding);
 2290|  7.89k|  u[0] = _mm_srai_epi32(u[0], bit);
 2291|       |
 2292|  7.89k|  x = _mm_mullo_epi32(in[0], cospi2);
 2293|  7.89k|  u[1] = _mm_sub_epi32(zero, x);
 2294|  7.89k|  u[1] = _mm_add_epi32(u[1], rnding);
 2295|  7.89k|  u[1] = _mm_srai_epi32(u[1], bit);
 2296|       |
 2297|  7.89k|  x = _mm_mullo_epi32(in[2], cospi54);
 2298|  7.89k|  u[2] = _mm_add_epi32(x, rnding);
 2299|  7.89k|  u[2] = _mm_srai_epi32(u[2], bit);
 2300|       |
 2301|  7.89k|  x = _mm_mullo_epi32(in[2], cospi10);
 2302|  7.89k|  u[3] = _mm_sub_epi32(zero, x);
 2303|  7.89k|  u[3] = _mm_add_epi32(u[3], rnding);
 2304|  7.89k|  u[3] = _mm_srai_epi32(u[3], bit);
 2305|       |
 2306|  7.89k|  x = _mm_mullo_epi32(in[4], cospi46);
 2307|  7.89k|  u[4] = _mm_add_epi32(x, rnding);
 2308|  7.89k|  u[4] = _mm_srai_epi32(u[4], bit);
 2309|       |
 2310|  7.89k|  x = _mm_mullo_epi32(in[4], cospi18);
 2311|  7.89k|  u[5] = _mm_sub_epi32(zero, x);
 2312|  7.89k|  u[5] = _mm_add_epi32(u[5], rnding);
 2313|  7.89k|  u[5] = _mm_srai_epi32(u[5], bit);
 2314|       |
 2315|  7.89k|  x = _mm_mullo_epi32(in[6], cospi38);
 2316|  7.89k|  u[6] = _mm_add_epi32(x, rnding);
 2317|  7.89k|  u[6] = _mm_srai_epi32(u[6], bit);
 2318|       |
 2319|  7.89k|  x = _mm_mullo_epi32(in[6], cospi26);
 2320|  7.89k|  u[7] = _mm_sub_epi32(zero, x);
 2321|  7.89k|  u[7] = _mm_add_epi32(u[7], rnding);
 2322|  7.89k|  u[7] = _mm_srai_epi32(u[7], bit);
 2323|       |
 2324|  7.89k|  u[8] = _mm_mullo_epi32(in[7], cospi34);
 2325|  7.89k|  u[8] = _mm_add_epi32(u[8], rnding);
 2326|  7.89k|  u[8] = _mm_srai_epi32(u[8], bit);
 2327|       |
 2328|  7.89k|  u[9] = _mm_mullo_epi32(in[7], cospi30);
 2329|  7.89k|  u[9] = _mm_add_epi32(u[9], rnding);
 2330|  7.89k|  u[9] = _mm_srai_epi32(u[9], bit);
 2331|       |
 2332|  7.89k|  u[10] = _mm_mullo_epi32(in[5], cospi42);
 2333|  7.89k|  u[10] = _mm_add_epi32(u[10], rnding);
 2334|  7.89k|  u[10] = _mm_srai_epi32(u[10], bit);
 2335|       |
 2336|  7.89k|  u[11] = _mm_mullo_epi32(in[5], cospi22);
 2337|  7.89k|  u[11] = _mm_add_epi32(u[11], rnding);
 2338|  7.89k|  u[11] = _mm_srai_epi32(u[11], bit);
 2339|       |
 2340|  7.89k|  u[12] = _mm_mullo_epi32(in[3], cospi50);
 2341|  7.89k|  u[12] = _mm_add_epi32(u[12], rnding);
 2342|  7.89k|  u[12] = _mm_srai_epi32(u[12], bit);
 2343|       |
 2344|  7.89k|  u[13] = _mm_mullo_epi32(in[3], cospi14);
 2345|  7.89k|  u[13] = _mm_add_epi32(u[13], rnding);
 2346|  7.89k|  u[13] = _mm_srai_epi32(u[13], bit);
 2347|       |
 2348|  7.89k|  u[14] = _mm_mullo_epi32(in[1], cospi58);
 2349|  7.89k|  u[14] = _mm_add_epi32(u[14], rnding);
 2350|  7.89k|  u[14] = _mm_srai_epi32(u[14], bit);
 2351|       |
 2352|  7.89k|  u[15] = _mm_mullo_epi32(in[1], cospi6);
 2353|  7.89k|  u[15] = _mm_add_epi32(u[15], rnding);
 2354|  7.89k|  u[15] = _mm_srai_epi32(u[15], bit);
 2355|       |
 2356|       |  // stage 3
 2357|  7.89k|  addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
 2358|  7.89k|  addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
 2359|  7.89k|  addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
 2360|  7.89k|  addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
 2361|  7.89k|  addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
 2362|  7.89k|  addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
 2363|  7.89k|  addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
 2364|  7.89k|  addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 2365|       |
 2366|       |  // stage 4
 2367|  7.89k|  y = _mm_mullo_epi32(u[8], cospi56);
 2368|  7.89k|  x = _mm_mullo_epi32(u[9], cospi56);
 2369|  7.89k|  u[8] = _mm_mullo_epi32(u[8], cospi8);
 2370|  7.89k|  u[8] = _mm_add_epi32(u[8], x);
 2371|  7.89k|  u[8] = _mm_add_epi32(u[8], rnding);
 2372|  7.89k|  u[8] = _mm_srai_epi32(u[8], bit);
 2373|       |
 2374|  7.89k|  x = _mm_mullo_epi32(u[9], cospi8);
 2375|  7.89k|  u[9] = _mm_sub_epi32(y, x);
 2376|  7.89k|  u[9] = _mm_add_epi32(u[9], rnding);
 2377|  7.89k|  u[9] = _mm_srai_epi32(u[9], bit);
 2378|       |
 2379|  7.89k|  x = _mm_mullo_epi32(u[11], cospi24);
 2380|  7.89k|  y = _mm_mullo_epi32(u[10], cospi24);
 2381|  7.89k|  u[10] = _mm_mullo_epi32(u[10], cospi40);
 2382|  7.89k|  u[10] = _mm_add_epi32(u[10], x);
 2383|  7.89k|  u[10] = _mm_add_epi32(u[10], rnding);
 2384|  7.89k|  u[10] = _mm_srai_epi32(u[10], bit);
 2385|       |
 2386|  7.89k|  x = _mm_mullo_epi32(u[11], cospi40);
 2387|  7.89k|  u[11] = _mm_sub_epi32(y, x);
 2388|  7.89k|  u[11] = _mm_add_epi32(u[11], rnding);
 2389|  7.89k|  u[11] = _mm_srai_epi32(u[11], bit);
 2390|       |
 2391|  7.89k|  x = _mm_mullo_epi32(u[13], cospi8);
 2392|  7.89k|  y = _mm_mullo_epi32(u[12], cospi8);
 2393|  7.89k|  u[12] = _mm_mullo_epi32(u[12], cospim56);
 2394|  7.89k|  u[12] = _mm_add_epi32(u[12], x);
 2395|  7.89k|  u[12] = _mm_add_epi32(u[12], rnding);
 2396|  7.89k|  u[12] = _mm_srai_epi32(u[12], bit);
 2397|       |
 2398|  7.89k|  x = _mm_mullo_epi32(u[13], cospim56);
 2399|  7.89k|  u[13] = _mm_sub_epi32(y, x);
 2400|  7.89k|  u[13] = _mm_add_epi32(u[13], rnding);
 2401|  7.89k|  u[13] = _mm_srai_epi32(u[13], bit);
 2402|       |
 2403|  7.89k|  x = _mm_mullo_epi32(u[15], cospi40);
 2404|  7.89k|  y = _mm_mullo_epi32(u[14], cospi40);
 2405|  7.89k|  u[14] = _mm_mullo_epi32(u[14], cospim24);
 2406|  7.89k|  u[14] = _mm_add_epi32(u[14], x);
 2407|  7.89k|  u[14] = _mm_add_epi32(u[14], rnding);
 2408|  7.89k|  u[14] = _mm_srai_epi32(u[14], bit);
 2409|       |
 2410|  7.89k|  x = _mm_mullo_epi32(u[15], cospim24);
 2411|  7.89k|  u[15] = _mm_sub_epi32(y, x);
 2412|  7.89k|  u[15] = _mm_add_epi32(u[15], rnding);
 2413|  7.89k|  u[15] = _mm_srai_epi32(u[15], bit);
 2414|       |
 2415|       |  // stage 5
 2416|  7.89k|  addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
 2417|  7.89k|  addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
 2418|  7.89k|  addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
 2419|  7.89k|  addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
 2420|  7.89k|  addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
 2421|  7.89k|  addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
 2422|  7.89k|  addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
 2423|  7.89k|  addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 2424|       |
 2425|       |  // stage 6
 2426|  7.89k|  x = _mm_mullo_epi32(u[5], cospi48);
 2427|  7.89k|  y = _mm_mullo_epi32(u[4], cospi48);
 2428|  7.89k|  u[4] = _mm_mullo_epi32(u[4], cospi16);
 2429|  7.89k|  u[4] = _mm_add_epi32(u[4], x);
 2430|  7.89k|  u[4] = _mm_add_epi32(u[4], rnding);
 2431|  7.89k|  u[4] = _mm_srai_epi32(u[4], bit);
 2432|       |
 2433|  7.89k|  x = _mm_mullo_epi32(u[5], cospi16);
 2434|  7.89k|  u[5] = _mm_sub_epi32(y, x);
 2435|  7.89k|  u[5] = _mm_add_epi32(u[5], rnding);
 2436|  7.89k|  u[5] = _mm_srai_epi32(u[5], bit);
 2437|       |
 2438|  7.89k|  x = _mm_mullo_epi32(u[7], cospi16);
 2439|  7.89k|  y = _mm_mullo_epi32(u[6], cospi16);
 2440|  7.89k|  u[6] = _mm_mullo_epi32(u[6], cospim48);
 2441|  7.89k|  u[6] = _mm_add_epi32(u[6], x);
 2442|  7.89k|  u[6] = _mm_add_epi32(u[6], rnding);
 2443|  7.89k|  u[6] = _mm_srai_epi32(u[6], bit);
 2444|       |
 2445|  7.89k|  x = _mm_mullo_epi32(u[7], cospim48);
 2446|  7.89k|  u[7] = _mm_sub_epi32(y, x);
 2447|  7.89k|  u[7] = _mm_add_epi32(u[7], rnding);
 2448|  7.89k|  u[7] = _mm_srai_epi32(u[7], bit);
 2449|       |
 2450|  7.89k|  x = _mm_mullo_epi32(u[13], cospi48);
 2451|  7.89k|  y = _mm_mullo_epi32(u[12], cospi48);
 2452|  7.89k|  u[12] = _mm_mullo_epi32(u[12], cospi16);
 2453|  7.89k|  u[12] = _mm_add_epi32(u[12], x);
 2454|  7.89k|  u[12] = _mm_add_epi32(u[12], rnding);
 2455|  7.89k|  u[12] = _mm_srai_epi32(u[12], bit);
 2456|       |
 2457|  7.89k|  x = _mm_mullo_epi32(u[13], cospi16);
 2458|  7.89k|  u[13] = _mm_sub_epi32(y, x);
 2459|  7.89k|  u[13] = _mm_add_epi32(u[13], rnding);
 2460|  7.89k|  u[13] = _mm_srai_epi32(u[13], bit);
 2461|       |
 2462|  7.89k|  x = _mm_mullo_epi32(u[15], cospi16);
 2463|  7.89k|  y = _mm_mullo_epi32(u[14], cospi16);
 2464|  7.89k|  u[14] = _mm_mullo_epi32(u[14], cospim48);
 2465|  7.89k|  u[14] = _mm_add_epi32(u[14], x);
 2466|  7.89k|  u[14] = _mm_add_epi32(u[14], rnding);
 2467|  7.89k|  u[14] = _mm_srai_epi32(u[14], bit);
 2468|       |
 2469|  7.89k|  x = _mm_mullo_epi32(u[15], cospim48);
 2470|  7.89k|  u[15] = _mm_sub_epi32(y, x);
 2471|  7.89k|  u[15] = _mm_add_epi32(u[15], rnding);
 2472|  7.89k|  u[15] = _mm_srai_epi32(u[15], bit);
 2473|       |
 2474|       |  // stage 7
 2475|  7.89k|  addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
 2476|  7.89k|  addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
 2477|  7.89k|  addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
 2478|  7.89k|  addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
 2479|  7.89k|  addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
 2480|  7.89k|  addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
 2481|  7.89k|  addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
 2482|  7.89k|  addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 2483|       |
 2484|       |  // stage 8
 2485|  7.89k|  y = _mm_mullo_epi32(u[2], cospi32);
 2486|  7.89k|  x = _mm_mullo_epi32(u[3], cospi32);
 2487|  7.89k|  u[2] = _mm_add_epi32(y, x);
 2488|  7.89k|  u[2] = _mm_add_epi32(u[2], rnding);
 2489|  7.89k|  u[2] = _mm_srai_epi32(u[2], bit);
 2490|       |
 2491|  7.89k|  u[3] = _mm_sub_epi32(y, x);
 2492|  7.89k|  u[3] = _mm_add_epi32(u[3], rnding);
 2493|  7.89k|  u[3] = _mm_srai_epi32(u[3], bit);
 2494|  7.89k|  y = _mm_mullo_epi32(u[6], cospi32);
 2495|  7.89k|  x = _mm_mullo_epi32(u[7], cospi32);
 2496|  7.89k|  u[6] = _mm_add_epi32(y, x);
 2497|  7.89k|  u[6] = _mm_add_epi32(u[6], rnding);
 2498|  7.89k|  u[6] = _mm_srai_epi32(u[6], bit);
 2499|       |
 2500|  7.89k|  u[7] = _mm_sub_epi32(y, x);
 2501|  7.89k|  u[7] = _mm_add_epi32(u[7], rnding);
 2502|  7.89k|  u[7] = _mm_srai_epi32(u[7], bit);
 2503|       |
 2504|  7.89k|  y = _mm_mullo_epi32(u[10], cospi32);
 2505|  7.89k|  x = _mm_mullo_epi32(u[11], cospi32);
 2506|  7.89k|  u[10] = _mm_add_epi32(y, x);
 2507|  7.89k|  u[10] = _mm_add_epi32(u[10], rnding);
 2508|  7.89k|  u[10] = _mm_srai_epi32(u[10], bit);
 2509|       |
 2510|  7.89k|  u[11] = _mm_sub_epi32(y, x);
 2511|  7.89k|  u[11] = _mm_add_epi32(u[11], rnding);
 2512|  7.89k|  u[11] = _mm_srai_epi32(u[11], bit);
 2513|       |
 2514|  7.89k|  y = _mm_mullo_epi32(u[14], cospi32);
 2515|  7.89k|  x = _mm_mullo_epi32(u[15], cospi32);
 2516|  7.89k|  u[14] = _mm_add_epi32(y, x);
 2517|  7.89k|  u[14] = _mm_add_epi32(u[14], rnding);
 2518|  7.89k|  u[14] = _mm_srai_epi32(u[14], bit);
 2519|       |
 2520|  7.89k|  u[15] = _mm_sub_epi32(y, x);
 2521|  7.89k|  u[15] = _mm_add_epi32(u[15], rnding);
 2522|  7.89k|  u[15] = _mm_srai_epi32(u[15], bit);
 2523|       |
 2524|       |  // stage 9
 2525|  7.89k|  if (do_cols) {
  ------------------
  |  Branch (2525:7): [True: 1.62k, False: 6.27k]
  ------------------
 2526|  1.62k|    out[0] = u[0];
 2527|  1.62k|    out[1] = _mm_sub_epi32(zero, u[8]);
 2528|  1.62k|    out[2] = u[12];
 2529|  1.62k|    out[3] = _mm_sub_epi32(zero, u[4]);
 2530|  1.62k|    out[4] = u[6];
 2531|  1.62k|    out[5] = _mm_sub_epi32(zero, u[14]);
 2532|  1.62k|    out[6] = u[10];
 2533|  1.62k|    out[7] = _mm_sub_epi32(zero, u[2]);
 2534|  1.62k|    out[8] = u[3];
 2535|  1.62k|    out[9] = _mm_sub_epi32(zero, u[11]);
 2536|  1.62k|    out[10] = u[15];
 2537|  1.62k|    out[11] = _mm_sub_epi32(zero, u[7]);
 2538|  1.62k|    out[12] = u[5];
 2539|  1.62k|    out[13] = _mm_sub_epi32(zero, u[13]);
 2540|  1.62k|    out[14] = u[9];
 2541|  1.62k|    out[15] = _mm_sub_epi32(zero, u[1]);
 2542|  6.27k|  } else {
 2543|  6.27k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  6.27k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 6.27k]
  |  |  ------------------
  ------------------
 2544|  6.27k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 2545|  6.27k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 2546|       |
 2547|  6.27k|    neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 2548|  6.27k|                     out_shift);
 2549|  6.27k|    neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
 2550|  6.27k|                     &clamp_hi_out, out_shift);
 2551|  6.27k|    neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
 2552|  6.27k|                     &clamp_hi_out, out_shift);
 2553|  6.27k|    neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
 2554|  6.27k|                     &clamp_hi_out, out_shift);
 2555|  6.27k|    neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
 2556|  6.27k|                     &clamp_hi_out, out_shift);
 2557|  6.27k|    neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
 2558|  6.27k|                     &clamp_hi_out, out_shift);
 2559|  6.27k|    neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
 2560|  6.27k|                     &clamp_hi_out, out_shift);
 2561|  6.27k|    neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
 2562|  6.27k|                     &clamp_hi_out, out_shift);
 2563|  6.27k|  }
 2564|  7.89k|}
highbd_inv_txfm_sse4.c:iadst16x16_sse4_1:
 2749|   421k|                              int bd, int out_shift) {
 2750|   421k|  const int32_t *cospi = cospi_arr(bit);
 2751|   421k|  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
 2752|   421k|  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
 2753|   421k|  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
 2754|   421k|  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
 2755|   421k|  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
 2756|   421k|  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
 2757|   421k|  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
 2758|   421k|  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
 2759|   421k|  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
 2760|   421k|  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
 2761|   421k|  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
 2762|   421k|  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
 2763|   421k|  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
 2764|   421k|  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
 2765|   421k|  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
 2766|   421k|  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
 2767|   421k|  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
 2768|   421k|  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
 2769|   421k|  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
 2770|   421k|  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
 2771|   421k|  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
 2772|   421k|  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
 2773|   421k|  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
 2774|   421k|  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
 2775|   421k|  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
 2776|   421k|  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
 2777|   421k|  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
 2778|   421k|  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  ------------------
  |  |   35|   843k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 421k]
  |  |  |  Branch (35:31): [True: 150k, False: 271k]
  |  |  |  Branch (35:44): [True: 150k, False: 271k]
  |  |  ------------------
  ------------------
 2779|   421k|  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 2780|   421k|  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 2781|   421k|  const __m128i zero = _mm_setzero_si128();
 2782|   421k|  __m128i u[16], v[16], x, y;
 2783|       |  // Calculate the column 0, 1, 2, 3
 2784|       |  // stage 0
 2785|       |  // stage 1
 2786|       |  // stage 2
 2787|   421k|  v[0] = _mm_mullo_epi32(in[15], cospi2);
 2788|   421k|  x = _mm_mullo_epi32(in[0], cospi62);
 2789|   421k|  v[0] = _mm_add_epi32(v[0], x);
 2790|   421k|  v[0] = _mm_add_epi32(v[0], rnding);
 2791|   421k|  v[0] = _mm_srai_epi32(v[0], bit);
 2792|       |
 2793|   421k|  v[1] = _mm_mullo_epi32(in[15], cospi62);
 2794|   421k|  x = _mm_mullo_epi32(in[0], cospi2);
 2795|   421k|  v[1] = _mm_sub_epi32(v[1], x);
 2796|   421k|  v[1] = _mm_add_epi32(v[1], rnding);
 2797|   421k|  v[1] = _mm_srai_epi32(v[1], bit);
 2798|       |
 2799|   421k|  v[2] = _mm_mullo_epi32(in[13], cospi10);
 2800|   421k|  x = _mm_mullo_epi32(in[2], cospi54);
 2801|   421k|  v[2] = _mm_add_epi32(v[2], x);
 2802|   421k|  v[2] = _mm_add_epi32(v[2], rnding);
 2803|   421k|  v[2] = _mm_srai_epi32(v[2], bit);
 2804|       |
 2805|   421k|  v[3] = _mm_mullo_epi32(in[13], cospi54);
 2806|   421k|  x = _mm_mullo_epi32(in[2], cospi10);
 2807|   421k|  v[3] = _mm_sub_epi32(v[3], x);
 2808|   421k|  v[3] = _mm_add_epi32(v[3], rnding);
 2809|   421k|  v[3] = _mm_srai_epi32(v[3], bit);
 2810|       |
 2811|   421k|  v[4] = _mm_mullo_epi32(in[11], cospi18);
 2812|   421k|  x = _mm_mullo_epi32(in[4], cospi46);
 2813|   421k|  v[4] = _mm_add_epi32(v[4], x);
 2814|   421k|  v[4] = _mm_add_epi32(v[4], rnding);
 2815|   421k|  v[4] = _mm_srai_epi32(v[4], bit);
 2816|       |
 2817|   421k|  v[5] = _mm_mullo_epi32(in[11], cospi46);
 2818|   421k|  x = _mm_mullo_epi32(in[4], cospi18);
 2819|   421k|  v[5] = _mm_sub_epi32(v[5], x);
 2820|   421k|  v[5] = _mm_add_epi32(v[5], rnding);
 2821|   421k|  v[5] = _mm_srai_epi32(v[5], bit);
 2822|       |
 2823|   421k|  v[6] = _mm_mullo_epi32(in[9], cospi26);
 2824|   421k|  x = _mm_mullo_epi32(in[6], cospi38);
 2825|   421k|  v[6] = _mm_add_epi32(v[6], x);
 2826|   421k|  v[6] = _mm_add_epi32(v[6], rnding);
 2827|   421k|  v[6] = _mm_srai_epi32(v[6], bit);
 2828|       |
 2829|   421k|  v[7] = _mm_mullo_epi32(in[9], cospi38);
 2830|   421k|  x = _mm_mullo_epi32(in[6], cospi26);
 2831|   421k|  v[7] = _mm_sub_epi32(v[7], x);
 2832|   421k|  v[7] = _mm_add_epi32(v[7], rnding);
 2833|   421k|  v[7] = _mm_srai_epi32(v[7], bit);
 2834|       |
 2835|   421k|  v[8] = _mm_mullo_epi32(in[7], cospi34);
 2836|   421k|  x = _mm_mullo_epi32(in[8], cospi30);
 2837|   421k|  v[8] = _mm_add_epi32(v[8], x);
 2838|   421k|  v[8] = _mm_add_epi32(v[8], rnding);
 2839|   421k|  v[8] = _mm_srai_epi32(v[8], bit);
 2840|       |
 2841|   421k|  v[9] = _mm_mullo_epi32(in[7], cospi30);
 2842|   421k|  x = _mm_mullo_epi32(in[8], cospi34);
 2843|   421k|  v[9] = _mm_sub_epi32(v[9], x);
 2844|   421k|  v[9] = _mm_add_epi32(v[9], rnding);
 2845|   421k|  v[9] = _mm_srai_epi32(v[9], bit);
 2846|       |
 2847|   421k|  v[10] = _mm_mullo_epi32(in[5], cospi42);
 2848|   421k|  x = _mm_mullo_epi32(in[10], cospi22);
 2849|   421k|  v[10] = _mm_add_epi32(v[10], x);
 2850|   421k|  v[10] = _mm_add_epi32(v[10], rnding);
 2851|   421k|  v[10] = _mm_srai_epi32(v[10], bit);
 2852|       |
 2853|   421k|  v[11] = _mm_mullo_epi32(in[5], cospi22);
 2854|   421k|  x = _mm_mullo_epi32(in[10], cospi42);
 2855|   421k|  v[11] = _mm_sub_epi32(v[11], x);
 2856|   421k|  v[11] = _mm_add_epi32(v[11], rnding);
 2857|   421k|  v[11] = _mm_srai_epi32(v[11], bit);
 2858|       |
 2859|   421k|  v[12] = _mm_mullo_epi32(in[3], cospi50);
 2860|   421k|  x = _mm_mullo_epi32(in[12], cospi14);
 2861|   421k|  v[12] = _mm_add_epi32(v[12], x);
 2862|   421k|  v[12] = _mm_add_epi32(v[12], rnding);
 2863|   421k|  v[12] = _mm_srai_epi32(v[12], bit);
 2864|       |
 2865|   421k|  v[13] = _mm_mullo_epi32(in[3], cospi14);
 2866|   421k|  x = _mm_mullo_epi32(in[12], cospi50);
 2867|   421k|  v[13] = _mm_sub_epi32(v[13], x);
 2868|   421k|  v[13] = _mm_add_epi32(v[13], rnding);
 2869|   421k|  v[13] = _mm_srai_epi32(v[13], bit);
 2870|       |
 2871|   421k|  v[14] = _mm_mullo_epi32(in[1], cospi58);
 2872|   421k|  x = _mm_mullo_epi32(in[14], cospi6);
 2873|   421k|  v[14] = _mm_add_epi32(v[14], x);
 2874|   421k|  v[14] = _mm_add_epi32(v[14], rnding);
 2875|   421k|  v[14] = _mm_srai_epi32(v[14], bit);
 2876|       |
 2877|   421k|  v[15] = _mm_mullo_epi32(in[1], cospi6);
 2878|   421k|  x = _mm_mullo_epi32(in[14], cospi58);
 2879|   421k|  v[15] = _mm_sub_epi32(v[15], x);
 2880|   421k|  v[15] = _mm_add_epi32(v[15], rnding);
 2881|   421k|  v[15] = _mm_srai_epi32(v[15], bit);
 2882|       |
 2883|       |  // stage 3
 2884|   421k|  addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
 2885|   421k|  addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
 2886|   421k|  addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
 2887|   421k|  addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
 2888|   421k|  addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
 2889|   421k|  addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
 2890|   421k|  addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
 2891|   421k|  addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
 2892|       |
 2893|       |  // stage 4
 2894|   421k|  v[0] = u[0];
 2895|   421k|  v[1] = u[1];
 2896|   421k|  v[2] = u[2];
 2897|   421k|  v[3] = u[3];
 2898|   421k|  v[4] = u[4];
 2899|   421k|  v[5] = u[5];
 2900|   421k|  v[6] = u[6];
 2901|   421k|  v[7] = u[7];
 2902|       |
 2903|   421k|  v[8] = _mm_mullo_epi32(u[8], cospi8);
 2904|   421k|  x = _mm_mullo_epi32(u[9], cospi56);
 2905|   421k|  v[8] = _mm_add_epi32(v[8], x);
 2906|   421k|  v[8] = _mm_add_epi32(v[8], rnding);
 2907|   421k|  v[8] = _mm_srai_epi32(v[8], bit);
 2908|       |
 2909|   421k|  v[9] = _mm_mullo_epi32(u[8], cospi56);
 2910|   421k|  x = _mm_mullo_epi32(u[9], cospi8);
 2911|   421k|  v[9] = _mm_sub_epi32(v[9], x);
 2912|   421k|  v[9] = _mm_add_epi32(v[9], rnding);
 2913|   421k|  v[9] = _mm_srai_epi32(v[9], bit);
 2914|       |
 2915|   421k|  v[10] = _mm_mullo_epi32(u[10], cospi40);
 2916|   421k|  x = _mm_mullo_epi32(u[11], cospi24);
 2917|   421k|  v[10] = _mm_add_epi32(v[10], x);
 2918|   421k|  v[10] = _mm_add_epi32(v[10], rnding);
 2919|   421k|  v[10] = _mm_srai_epi32(v[10], bit);
 2920|       |
 2921|   421k|  v[11] = _mm_mullo_epi32(u[10], cospi24);
 2922|   421k|  x = _mm_mullo_epi32(u[11], cospi40);
 2923|   421k|  v[11] = _mm_sub_epi32(v[11], x);
 2924|   421k|  v[11] = _mm_add_epi32(v[11], rnding);
 2925|   421k|  v[11] = _mm_srai_epi32(v[11], bit);
 2926|       |
 2927|   421k|  v[12] = _mm_mullo_epi32(u[12], cospim56);
 2928|   421k|  x = _mm_mullo_epi32(u[13], cospi8);
 2929|   421k|  v[12] = _mm_add_epi32(v[12], x);
 2930|   421k|  v[12] = _mm_add_epi32(v[12], rnding);
 2931|   421k|  v[12] = _mm_srai_epi32(v[12], bit);
 2932|       |
 2933|   421k|  v[13] = _mm_mullo_epi32(u[12], cospi8);
 2934|   421k|  x = _mm_mullo_epi32(u[13], cospim56);
 2935|   421k|  v[13] = _mm_sub_epi32(v[13], x);
 2936|   421k|  v[13] = _mm_add_epi32(v[13], rnding);
 2937|   421k|  v[13] = _mm_srai_epi32(v[13], bit);
 2938|       |
 2939|   421k|  v[14] = _mm_mullo_epi32(u[14], cospim24);
 2940|   421k|  x = _mm_mullo_epi32(u[15], cospi40);
 2941|   421k|  v[14] = _mm_add_epi32(v[14], x);
 2942|   421k|  v[14] = _mm_add_epi32(v[14], rnding);
 2943|   421k|  v[14] = _mm_srai_epi32(v[14], bit);
 2944|       |
 2945|   421k|  v[15] = _mm_mullo_epi32(u[14], cospi40);
 2946|   421k|  x = _mm_mullo_epi32(u[15], cospim24);
 2947|   421k|  v[15] = _mm_sub_epi32(v[15], x);
 2948|   421k|  v[15] = _mm_add_epi32(v[15], rnding);
 2949|   421k|  v[15] = _mm_srai_epi32(v[15], bit);
 2950|       |
 2951|       |  // stage 5
 2952|   421k|  addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
 2953|   421k|  addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
 2954|   421k|  addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
 2955|   421k|  addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
 2956|   421k|  addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
 2957|   421k|  addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
 2958|   421k|  addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
 2959|   421k|  addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
 2960|       |
 2961|       |  // stage 6
 2962|   421k|  v[0] = u[0];
 2963|   421k|  v[1] = u[1];
 2964|   421k|  v[2] = u[2];
 2965|   421k|  v[3] = u[3];
 2966|       |
 2967|   421k|  v[4] = _mm_mullo_epi32(u[4], cospi16);
 2968|   421k|  x = _mm_mullo_epi32(u[5], cospi48);
 2969|   421k|  v[4] = _mm_add_epi32(v[4], x);
 2970|   421k|  v[4] = _mm_add_epi32(v[4], rnding);
 2971|   421k|  v[4] = _mm_srai_epi32(v[4], bit);
 2972|       |
 2973|   421k|  v[5] = _mm_mullo_epi32(u[4], cospi48);
 2974|   421k|  x = _mm_mullo_epi32(u[5], cospi16);
 2975|   421k|  v[5] = _mm_sub_epi32(v[5], x);
 2976|   421k|  v[5] = _mm_add_epi32(v[5], rnding);
 2977|   421k|  v[5] = _mm_srai_epi32(v[5], bit);
 2978|       |
 2979|   421k|  v[6] = _mm_mullo_epi32(u[6], cospim48);
 2980|   421k|  x = _mm_mullo_epi32(u[7], cospi16);
 2981|   421k|  v[6] = _mm_add_epi32(v[6], x);
 2982|   421k|  v[6] = _mm_add_epi32(v[6], rnding);
 2983|   421k|  v[6] = _mm_srai_epi32(v[6], bit);
 2984|       |
 2985|   421k|  v[7] = _mm_mullo_epi32(u[6], cospi16);
 2986|   421k|  x = _mm_mullo_epi32(u[7], cospim48);
 2987|   421k|  v[7] = _mm_sub_epi32(v[7], x);
 2988|   421k|  v[7] = _mm_add_epi32(v[7], rnding);
 2989|   421k|  v[7] = _mm_srai_epi32(v[7], bit);
 2990|       |
 2991|   421k|  v[8] = u[8];
 2992|   421k|  v[9] = u[9];
 2993|   421k|  v[10] = u[10];
 2994|   421k|  v[11] = u[11];
 2995|       |
 2996|   421k|  v[12] = _mm_mullo_epi32(u[12], cospi16);
 2997|   421k|  x = _mm_mullo_epi32(u[13], cospi48);
 2998|   421k|  v[12] = _mm_add_epi32(v[12], x);
 2999|   421k|  v[12] = _mm_add_epi32(v[12], rnding);
 3000|   421k|  v[12] = _mm_srai_epi32(v[12], bit);
 3001|       |
 3002|   421k|  v[13] = _mm_mullo_epi32(u[12], cospi48);
 3003|   421k|  x = _mm_mullo_epi32(u[13], cospi16);
 3004|   421k|  v[13] = _mm_sub_epi32(v[13], x);
 3005|   421k|  v[13] = _mm_add_epi32(v[13], rnding);
 3006|   421k|  v[13] = _mm_srai_epi32(v[13], bit);
 3007|       |
 3008|   421k|  v[14] = _mm_mullo_epi32(u[14], cospim48);
 3009|   421k|  x = _mm_mullo_epi32(u[15], cospi16);
 3010|   421k|  v[14] = _mm_add_epi32(v[14], x);
 3011|   421k|  v[14] = _mm_add_epi32(v[14], rnding);
 3012|   421k|  v[14] = _mm_srai_epi32(v[14], bit);
 3013|       |
 3014|   421k|  v[15] = _mm_mullo_epi32(u[14], cospi16);
 3015|   421k|  x = _mm_mullo_epi32(u[15], cospim48);
 3016|   421k|  v[15] = _mm_sub_epi32(v[15], x);
 3017|   421k|  v[15] = _mm_add_epi32(v[15], rnding);
 3018|   421k|  v[15] = _mm_srai_epi32(v[15], bit);
 3019|       |
 3020|       |  // stage 7
 3021|   421k|  addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
 3022|   421k|  addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
 3023|   421k|  addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
 3024|   421k|  addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
 3025|   421k|  addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
 3026|   421k|  addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
 3027|   421k|  addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
 3028|   421k|  addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
 3029|       |
 3030|       |  // stage 8
 3031|   421k|  v[0] = u[0];
 3032|   421k|  v[1] = u[1];
 3033|       |
 3034|   421k|  y = _mm_mullo_epi32(u[2], cospi32);
 3035|   421k|  x = _mm_mullo_epi32(u[3], cospi32);
 3036|   421k|  v[2] = _mm_add_epi32(y, x);
 3037|   421k|  v[2] = _mm_add_epi32(v[2], rnding);
 3038|   421k|  v[2] = _mm_srai_epi32(v[2], bit);
 3039|       |
 3040|   421k|  v[3] = _mm_sub_epi32(y, x);
 3041|   421k|  v[3] = _mm_add_epi32(v[3], rnding);
 3042|   421k|  v[3] = _mm_srai_epi32(v[3], bit);
 3043|       |
 3044|   421k|  v[4] = u[4];
 3045|   421k|  v[5] = u[5];
 3046|       |
 3047|   421k|  y = _mm_mullo_epi32(u[6], cospi32);
 3048|   421k|  x = _mm_mullo_epi32(u[7], cospi32);
 3049|   421k|  v[6] = _mm_add_epi32(y, x);
 3050|   421k|  v[6] = _mm_add_epi32(v[6], rnding);
 3051|   421k|  v[6] = _mm_srai_epi32(v[6], bit);
 3052|       |
 3053|   421k|  v[7] = _mm_sub_epi32(y, x);
 3054|   421k|  v[7] = _mm_add_epi32(v[7], rnding);
 3055|   421k|  v[7] = _mm_srai_epi32(v[7], bit);
 3056|       |
 3057|   421k|  v[8] = u[8];
 3058|   421k|  v[9] = u[9];
 3059|       |
 3060|   421k|  y = _mm_mullo_epi32(u[10], cospi32);
 3061|   421k|  x = _mm_mullo_epi32(u[11], cospi32);
 3062|   421k|  v[10] = _mm_add_epi32(y, x);
 3063|   421k|  v[10] = _mm_add_epi32(v[10], rnding);
 3064|   421k|  v[10] = _mm_srai_epi32(v[10], bit);
 3065|       |
 3066|   421k|  v[11] = _mm_sub_epi32(y, x);
 3067|   421k|  v[11] = _mm_add_epi32(v[11], rnding);
 3068|   421k|  v[11] = _mm_srai_epi32(v[11], bit);
 3069|       |
 3070|   421k|  v[12] = u[12];
 3071|   421k|  v[13] = u[13];
 3072|       |
 3073|   421k|  y = _mm_mullo_epi32(u[14], cospi32);
 3074|   421k|  x = _mm_mullo_epi32(u[15], cospi32);
 3075|   421k|  v[14] = _mm_add_epi32(y, x);
 3076|   421k|  v[14] = _mm_add_epi32(v[14], rnding);
 3077|   421k|  v[14] = _mm_srai_epi32(v[14], bit);
 3078|       |
 3079|   421k|  v[15] = _mm_sub_epi32(y, x);
 3080|   421k|  v[15] = _mm_add_epi32(v[15], rnding);
 3081|   421k|  v[15] = _mm_srai_epi32(v[15], bit);
 3082|       |
 3083|       |  // stage 9
 3084|   421k|  if (do_cols) {
  ------------------
  |  Branch (3084:7): [True: 150k, False: 271k]
  ------------------
 3085|   150k|    out[0] = v[0];
 3086|   150k|    out[1] = _mm_sub_epi32(zero, v[8]);
 3087|   150k|    out[2] = v[12];
 3088|   150k|    out[3] = _mm_sub_epi32(zero, v[4]);
 3089|   150k|    out[4] = v[6];
 3090|   150k|    out[5] = _mm_sub_epi32(zero, v[14]);
 3091|   150k|    out[6] = v[10];
 3092|   150k|    out[7] = _mm_sub_epi32(zero, v[2]);
 3093|   150k|    out[8] = v[3];
 3094|   150k|    out[9] = _mm_sub_epi32(zero, v[11]);
 3095|   150k|    out[10] = v[15];
 3096|   150k|    out[11] = _mm_sub_epi32(zero, v[7]);
 3097|   150k|    out[12] = v[5];
 3098|   150k|    out[13] = _mm_sub_epi32(zero, v[13]);
 3099|   150k|    out[14] = v[9];
 3100|   150k|    out[15] = _mm_sub_epi32(zero, v[1]);
 3101|   271k|  } else {
 3102|   271k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   271k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 271k]
  |  |  ------------------
  ------------------
 3103|   271k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 3104|   271k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 3105|       |
 3106|   271k|    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
 3107|   271k|                     out_shift);
 3108|   271k|    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
 3109|   271k|                     &clamp_hi_out, out_shift);
 3110|   271k|    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
 3111|   271k|                     &clamp_hi_out, out_shift);
 3112|   271k|    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
 3113|   271k|                     &clamp_hi_out, out_shift);
 3114|   271k|    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
 3115|   271k|                     &clamp_hi_out, out_shift);
 3116|   271k|    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
 3117|   271k|                     &clamp_hi_out, out_shift);
 3118|   271k|    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
 3119|   271k|                     &clamp_hi_out, out_shift);
 3120|   271k|    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
 3121|   271k|                     &clamp_hi_out, out_shift);
 3122|   271k|  }
 3123|   421k|}
highbd_inv_txfm_sse4.c:iidentity16_sse4_1:
 3125|  1.01M|                               int bd, int out_shift) {
 3126|  1.01M|  (void)bit;
 3127|  1.01M|  __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
 3128|  1.01M|  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  ------------------
  |  |   41|  1.01M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 3129|  1.01M|  __m128i a0_low, a0_high, a1_low, a1_high;
 3130|  1.01M|  __m128i zero = _mm_setzero_si128();
 3131|  1.01M|  offset = _mm_unpacklo_epi32(offset, zero);
 3132|       |
 3133|  17.2M|  for (int i = 0; i < 16; i++) {
  ------------------
  |  Branch (3133:19): [True: 16.2M, False: 1.01M]
  ------------------
 3134|  16.2M|    a0_low = _mm_mul_epi32(in[i], fact);
 3135|  16.2M|    a0_low = _mm_add_epi32(a0_low, offset);
 3136|  16.2M|    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
  ------------------
  |  |   41|  16.2M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 3137|       |
 3138|  16.2M|    a0_high = _mm_srli_si128(in[i], 4);
 3139|  16.2M|    a0_high = _mm_mul_epi32(a0_high, fact);
 3140|  16.2M|    a0_high = _mm_add_epi32(a0_high, offset);
 3141|  16.2M|    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
  ------------------
  |  |   41|  16.2M|#define NewSqrt2Bits ((int32_t)12)
  ------------------
 3142|       |
 3143|  16.2M|    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
 3144|  16.2M|    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
 3145|  16.2M|    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
 3146|  16.2M|  }
 3147|       |
 3148|  1.01M|  if (!do_cols) {
  ------------------
  |  Branch (3148:7): [True: 468k, False: 546k]
  ------------------
 3149|   468k|    const int log_range = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|   468k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 468k]
  |  |  ------------------
  ------------------
 3150|   468k|    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
 3151|   468k|    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
 3152|   468k|    round_shift_8x8(out, out_shift);
 3153|   468k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16);
 3154|   468k|  }
 3155|  1.01M|}
highbd_inv_txfm_sse4.c:iidentity32_sse4_1:
 5170|   138k|                               int bd, int out_shift) {
 5171|   138k|  (void)bit;
 5172|   415k|  for (int i = 0; i < 32; i += 16) {
  ------------------
  |  Branch (5172:19): [True: 276k, False: 138k]
  ------------------
 5173|   276k|    out[i] = _mm_slli_epi32(in[i], 2);
 5174|   276k|    out[i + 1] = _mm_slli_epi32(in[i + 1], 2);
 5175|   276k|    out[i + 2] = _mm_slli_epi32(in[i + 2], 2);
 5176|   276k|    out[i + 3] = _mm_slli_epi32(in[i + 3], 2);
 5177|   276k|    out[i + 4] = _mm_slli_epi32(in[i + 4], 2);
 5178|   276k|    out[i + 5] = _mm_slli_epi32(in[i + 5], 2);
 5179|   276k|    out[i + 6] = _mm_slli_epi32(in[i + 6], 2);
 5180|   276k|    out[i + 7] = _mm_slli_epi32(in[i + 7], 2);
 5181|   276k|    out[i + 8] = _mm_slli_epi32(in[i + 8], 2);
 5182|   276k|    out[i + 9] = _mm_slli_epi32(in[i + 9], 2);
 5183|   276k|    out[i + 10] = _mm_slli_epi32(in[i + 10], 2);
 5184|   276k|    out[i + 11] = _mm_slli_epi32(in[i + 11], 2);
 5185|   276k|    out[i + 12] = _mm_slli_epi32(in[i + 12], 2);
 5186|   276k|    out[i + 13] = _mm_slli_epi32(in[i + 13], 2);
 5187|   276k|    out[i + 14] = _mm_slli_epi32(in[i + 14], 2);
 5188|   276k|    out[i + 15] = _mm_slli_epi32(in[i + 15], 2);
 5189|   276k|  }
 5190|       |
 5191|   138k|  if (!do_cols) {
  ------------------
  |  Branch (5191:7): [True: 73.2k, False: 65.2k]
  ------------------
 5192|  73.2k|    const int log_range_out = AOMMAX(16, bd + 6);
  ------------------
  |  |   35|  73.2k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 0, False: 73.2k]
  |  |  ------------------
  ------------------
 5193|  73.2k|    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
 5194|  73.2k|    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
 5195|  73.2k|    round_shift_8x8(out, out_shift);
 5196|  73.2k|    round_shift_8x8(out + 16, out_shift);
 5197|  73.2k|    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
 5198|  73.2k|  }
 5199|   138k|}
highbd_inv_txfm_sse4.c:load_buffer_32bit_input:
  135|  5.93M|                                           __m128i *out, int out_size) {
  136|  54.0M|  for (int i = 0; i < out_size; ++i) {
  ------------------
  |  Branch (136:19): [True: 48.0M, False: 5.93M]
  ------------------
  137|  48.0M|    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
  138|  48.0M|  }
  139|  5.93M|}
highbd_inv_txfm_sse4.c:highbd_write_buffer_8xn_sse4_1:
  123|  3.16M|                                                  int height, const int bd) {
  124|  3.16M|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (124:11): [True: 150k, False: 3.01M]
  ------------------
  125|  3.16M|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (125:20): [True: 150k, False: 3.01M]
  ------------------
  126|  22.0M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (126:19): [True: 18.8M, False: 3.16M]
  ------------------
  127|  18.8M|    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
  128|  18.8M|    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);
  129|       |
  130|  18.8M|    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  131|  18.8M|  }
  132|  3.16M|}
highbd_inv_txfm_sse4.c:highbd_get_recon_8x8_sse4_1:
   83|  18.8M|                                                  const int bd) {
   84|  18.8M|  __m128i x0 = _mm_cvtepi16_epi32(pred);
   85|  18.8M|  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
   86|  18.8M|  __m128i min_clip_val = _mm_setzero_si128();
   87|  18.8M|  __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
   88|  18.8M|  x0 = _mm_add_epi32(res0, x0);
   89|  18.8M|  x1 = _mm_add_epi32(res1, x1);
   90|  18.8M|  x0 = _mm_max_epi32(x0, min_clip_val);
   91|  18.8M|  x0 = _mm_min_epi32(x0, max_clip_val);
   92|  18.8M|  x1 = _mm_max_epi32(x1, min_clip_val);
   93|  18.8M|  x1 = _mm_min_epi32(x1, max_clip_val);
   94|  18.8M|  x0 = _mm_packus_epi32(x0, x1);
   95|  18.8M|  return x0;
   96|  18.8M|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_h_identity_ssse41:
 5230|  58.7k|                                                    const int bd) {
 5231|  58.7k|  __m128i buf1[64];
 5232|  58.7k|  int eobx, eoby;
 5233|  58.7k|  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
 5234|  58.7k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5235|  58.7k|  const int txw_idx = get_txw_idx(tx_size);
 5236|  58.7k|  const int txh_idx = get_txh_idx(tx_size);
 5237|  58.7k|  const int txfm_size_col = tx_size_wide[tx_size];
 5238|  58.7k|  const int txfm_size_row = tx_size_high[tx_size];
 5239|  58.7k|  const int buf_size_w = AOMMIN(32, txfm_size_col);
  ------------------
  |  |   34|  58.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 58.7k]
  |  |  ------------------
  ------------------
 5240|  58.7k|  const int buf_size_w_div4 = buf_size_w >> 2;
 5241|  58.7k|  const int buf_size_h_div8 = (eoby + 8) >> 3;
 5242|  58.7k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|  58.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 58.7k]
  |  |  ------------------
  ------------------
 5243|  58.7k|  const int input_stride = row_max;
 5244|  58.7k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 5245|  58.7k|  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
 5246|  58.7k|  const transform_1d_sse4_1 row_txfm =
 5247|  58.7k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 5248|  58.7k|  const transform_1d_sse4_1 col_txfm =
 5249|  58.7k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
 5250|  58.7k|  int ud_flip, lr_flip;
 5251|  58.7k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5252|       |
 5253|   192k|  for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
  ------------------
  |  Branch (5253:19): [True: 133k, False: 58.7k]
  ------------------
 5254|   133k|    __m128i buf0[16];
 5255|   133k|    load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
 5256|   133k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (5256:9): [True: 36.0k, False: 97.2k]
  |  Branch (5256:27): [True: 28.1k, False: 69.0k]
  ------------------
 5257|  64.1k|      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
 5258|  64.1k|                                           NewInvSqrt2);
 5259|  64.1k|    }
 5260|   133k|    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|   133k|#define INV_COS_BIT 12
  ------------------
 5261|       |
 5262|   133k|    __m128i *_buf1 = buf1 + i * 4;
 5263|       |
 5264|   493k|    for (int j = 0; j < buf_size_w_div4; ++j) {
  ------------------
  |  Branch (5264:21): [True: 359k, False: 133k]
  ------------------
 5265|   359k|      __m128i *buf0_cur = buf0 + j * 4;
 5266|   359k|      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
  ------------------
  |  |   18|   359k|  do {                                                \
  |  |   19|   359k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|   359k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|   359k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|   359k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|   359k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|   359k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|   359k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|   359k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|   359k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|   359k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5267|   359k|                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
 5268|   359k|      _buf1[j * txfm_size_row + 0] = buf0_cur[0];
 5269|   359k|      _buf1[j * txfm_size_row + 1] = buf0_cur[1];
 5270|   359k|      _buf1[j * txfm_size_row + 2] = buf0_cur[2];
 5271|   359k|      _buf1[j * txfm_size_row + 3] = buf0_cur[3];
 5272|   359k|    }
 5273|   133k|  }
 5274|   219k|  for (int i = 0; i < buf_size_w_div4; i++) {
  ------------------
  |  Branch (5274:19): [True: 160k, False: 58.7k]
  ------------------
 5275|   160k|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
  ------------------
  |  |   43|   160k|#define INV_COS_BIT 12
  ------------------
 5276|   160k|             bd, 0);
 5277|       |
 5278|   160k|    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
 5279|   160k|                                    buf1 + i * txfm_size_row, txfm_size_row,
 5280|   160k|                                    -shift[1]);
 5281|   160k|  }
 5282|       |
 5283|       |  // write to buffer
 5284|   138k|  for (int i = 0; i < (txfm_size_col >> 3); i++) {
  ------------------
  |  Branch (5284:19): [True: 80.1k, False: 58.7k]
  ------------------
 5285|  80.1k|    highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
 5286|  80.1k|                                   stride, ud_flip, txfm_size_row, bd);
 5287|  80.1k|  }
 5288|  58.7k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_v_identity_ssse41:
 5293|   177k|                                                    const int bd) {
 5294|   177k|  __m128i buf1[64];
 5295|   177k|  int eobx, eoby;
 5296|   177k|  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
 5297|   177k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5298|   177k|  const int txw_idx = get_txw_idx(tx_size);
 5299|   177k|  const int txh_idx = get_txh_idx(tx_size);
 5300|   177k|  const int txfm_size_col = tx_size_wide[tx_size];
 5301|   177k|  const int txfm_size_row = tx_size_high[tx_size];
 5302|   177k|  const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
  ------------------
  |  |   34|   177k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 177k]
  |  |  ------------------
  ------------------
 5303|   177k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|   177k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 177k]
  |  |  ------------------
  ------------------
 5304|   177k|  const int input_stride = row_max;
 5305|   177k|  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
 5306|   177k|  const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
 5307|   177k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 5308|   177k|  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
 5309|   177k|  const transform_1d_sse4_1 row_txfm =
 5310|   177k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
 5311|   177k|  const transform_1d_sse4_1 col_txfm =
 5312|   177k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 5313|   177k|  int ud_flip, lr_flip;
 5314|   177k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5315|       |
 5316|   620k|  for (int i = 0; i < (row_max >> 2); ++i) {
  ------------------
  |  Branch (5316:19): [True: 442k, False: 177k]
  ------------------
 5317|   442k|    __m128i buf0[16];
 5318|   442k|    load_buffer_32bit_input(input + i * 4, input_stride, buf0,
 5319|   442k|                            buf_size_nonzero_w);
 5320|   442k|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (5320:9): [True: 95.6k, False: 347k]
  |  Branch (5320:27): [True: 70.6k, False: 276k]
  ------------------
 5321|   166k|      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
 5322|   166k|                                           NewInvSqrt2);
 5323|   166k|    }
 5324|   442k|    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|   442k|#define INV_COS_BIT 12
  ------------------
 5325|       |
 5326|   442k|    __m128i *_buf1 = buf1 + i * 4;
 5327|   442k|    if (lr_flip) {
  ------------------
  |  Branch (5327:9): [True: 30.1k, False: 412k]
  ------------------
 5328|   110k|      for (int j = 0; j < buf_size_w_div4; ++j) {
  ------------------
  |  Branch (5328:23): [True: 80.5k, False: 30.1k]
  ------------------
 5329|  80.5k|        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
  ------------------
  |  |   18|  80.5k|  do {                                                \
  |  |   19|  80.5k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  80.5k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  80.5k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  80.5k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  80.5k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  80.5k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  80.5k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  80.5k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  80.5k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  80.5k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5330|  80.5k|                      buf0[4 * j],
 5331|  80.5k|                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
 5332|  80.5k|                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
 5333|  80.5k|                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
 5334|  80.5k|                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
 5335|  80.5k|      }
 5336|   412k|    } else {
 5337|  1.62M|      for (int j = 0; j < buf_size_w_div4; ++j) {
  ------------------
  |  Branch (5337:23): [True: 1.21M, False: 412k]
  ------------------
 5338|  1.21M|        TRANSPOSE_4X4(
  ------------------
  |  |   18|  1.21M|  do {                                                \
  |  |   19|  1.21M|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  1.21M|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  1.21M|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  1.21M|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  1.21M|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  1.21M|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  1.21M|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  1.21M|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  1.21M|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  1.21M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5339|  1.21M|            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
 5340|  1.21M|            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
 5341|  1.21M|            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
 5342|  1.21M|      }
 5343|   412k|    }
 5344|   442k|  }
 5345|   680k|  for (int i = 0; i < buf_size_w_div4; i++) {
  ------------------
  |  Branch (5345:19): [True: 503k, False: 177k]
  ------------------
 5346|   503k|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
  ------------------
  |  |   43|   503k|#define INV_COS_BIT 12
  ------------------
 5347|   503k|             bd, 0);
 5348|       |
 5349|   503k|    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
 5350|   503k|                                    buf1 + i * txfm_size_row, txfm_size_row,
 5351|   503k|                                    -shift[1]);
 5352|   503k|  }
 5353|       |
 5354|       |  // write to buffer
 5355|   177k|  {
 5356|   428k|    for (int i = 0; i < (txfm_size_col >> 3); i++) {
  ------------------
  |  Branch (5356:21): [True: 251k, False: 177k]
  ------------------
 5357|   251k|      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
 5358|   251k|                                     output + 8 * i, stride, ud_flip,
 5359|   251k|                                     txfm_size_row, bd);
 5360|   251k|    }
 5361|   177k|  }
 5362|   177k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_idtx_ssse41:
 5366|   384k|                                              int eob, const int bd) {
 5367|   384k|  (void)eob;
 5368|   384k|  __m128i buf1[64 * 4];
 5369|   384k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5370|   384k|  const int txw_idx = get_txw_idx(tx_size);
 5371|   384k|  const int txh_idx = get_txh_idx(tx_size);
 5372|   384k|  const int txfm_size_col = tx_size_wide[tx_size];
 5373|   384k|  const int txfm_size_row = tx_size_high[tx_size];
 5374|   384k|  const int row_max = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|   384k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 384k]
  |  |  ------------------
  ------------------
 5375|   384k|  const int input_stride = row_max;
 5376|   384k|  const int buf_size_w = AOMMIN(32, txfm_size_col);
  ------------------
  |  |   34|   384k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 384k]
  |  |  ------------------
  ------------------
 5377|   384k|  const int buf_size_w_div4 = buf_size_w >> 2;
 5378|   384k|  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 5379|   384k|  const transform_1d_sse4_1 row_txfm =
 5380|   384k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 5381|   384k|  const transform_1d_sse4_1 col_txfm =
 5382|   384k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 5383|       |
 5384|  1.42M|  for (int i = 0; i < (row_max >> 2); ++i) {
  ------------------
  |  Branch (5384:19): [True: 1.03M, False: 384k]
  ------------------
 5385|  1.03M|    __m128i buf0[32];
 5386|  1.03M|    load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
 5387|  1.03M|    if (rect_type == 1 || rect_type == -1) {
  ------------------
  |  Branch (5387:9): [True: 106k, False: 932k]
  |  Branch (5387:27): [True: 161k, False: 771k]
  ------------------
 5388|   268k|      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
 5389|   268k|                                           NewInvSqrt2);
 5390|   268k|    }
 5391|  1.03M|    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  1.03M|#define INV_COS_BIT 12
  ------------------
 5392|       |
 5393|  1.03M|    __m128i *_buf1 = buf1 + i * 4;
 5394|  4.21M|    for (int j = 0; j < buf_size_w_div4; ++j) {
  ------------------
  |  Branch (5394:21): [True: 3.17M, False: 1.03M]
  ------------------
 5395|  3.17M|      __m128i *buf0_cur = buf0 + j * 4;
 5396|  3.17M|      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
  ------------------
  |  |   18|  3.17M|  do {                                                \
  |  |   19|  3.17M|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  3.17M|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  3.17M|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  3.17M|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  3.17M|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  3.17M|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  3.17M|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  3.17M|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  3.17M|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  3.17M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5397|  3.17M|                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
 5398|  3.17M|      _buf1[j * txfm_size_row + 0] = buf0_cur[0];
 5399|  3.17M|      _buf1[j * txfm_size_row + 1] = buf0_cur[1];
 5400|  3.17M|      _buf1[j * txfm_size_row + 2] = buf0_cur[2];
 5401|  3.17M|      _buf1[j * txfm_size_row + 3] = buf0_cur[3];
 5402|  3.17M|    }
 5403|  1.03M|  }
 5404|  1.47M|  for (int i = 0; i < buf_size_w_div4; i++) {
  ------------------
  |  Branch (5404:19): [True: 1.08M, False: 384k]
  ------------------
 5405|  1.08M|    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
  ------------------
  |  |   43|  1.08M|#define INV_COS_BIT 12
  ------------------
 5406|  1.08M|             bd, 0);
 5407|       |
 5408|  1.08M|    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
 5409|  1.08M|                                    buf1 + i * txfm_size_row, txfm_size_row,
 5410|  1.08M|                                    -shift[1]);
 5411|  1.08M|  }
 5412|       |
 5413|       |  // write to buffer
 5414|   384k|  {
 5415|   927k|    for (int i = 0; i < (txfm_size_col >> 3); i++) {
  ------------------
  |  Branch (5415:21): [True: 543k, False: 384k]
  ------------------
 5416|   543k|      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
 5417|   543k|                                     output + 8 * i, stride, 0, txfm_size_row,
 5418|   543k|                                     bd);
 5419|   543k|    }
 5420|   384k|  }
 5421|   384k|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_4x8_sse4_1:
 5759|   496k|                                               const TxfmParam *txfm_param) {
 5760|   496k|  int bd = txfm_param->bd;
 5761|   496k|  const TX_TYPE tx_type = txfm_param->tx_type;
 5762|   496k|  const TX_SIZE tx_size = txfm_param->tx_size;
 5763|   496k|  int eob = txfm_param->eob;
 5764|   496k|  highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
  ------------------
  |  |   75|   496k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5765|   496k|                                  tx_type, tx_size, eob, bd);
 5766|   496k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_4x8_sse41:
 5506|   496k|                                            int eob, const int bd) {
 5507|   496k|  (void)eob;
 5508|   496k|  __m128i buf1[8];
 5509|   496k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5510|   496k|  const int txw_idx = get_txw_idx(tx_size);
 5511|   496k|  const int txh_idx = get_txh_idx(tx_size);
 5512|   496k|  const int txfm_size_col = tx_size_wide[tx_size];
 5513|   496k|  const int txfm_size_row = tx_size_high[tx_size];
 5514|   496k|  const transform_1d_sse4_1 row_txfm =
 5515|   496k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 5516|   496k|  const transform_1d_sse4_1 col_txfm =
 5517|   496k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
 5518|   496k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|   496k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 496k]
  |  |  ------------------
  ------------------
 5519|       |
 5520|   496k|  assert(col_txfm != NULL);
 5521|   496k|  assert(row_txfm != NULL);
 5522|   496k|  int ud_flip, lr_flip;
 5523|   496k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5524|       |
 5525|       |  // 1st stage: column transform
 5526|   496k|  __m128i buf0[8];
 5527|   496k|  load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
 5528|   496k|  load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
 5529|   496k|  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
 5530|   496k|                                       NewInvSqrt2);
 5531|   496k|  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|   496k|#define INV_COS_BIT 12
  ------------------
 5532|   496k|  row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|   496k|#define INV_COS_BIT 12
  ------------------
 5533|       |
 5534|   496k|  if (lr_flip) {
  ------------------
  |  Branch (5534:7): [True: 62.1k, False: 433k]
  ------------------
 5535|  62.1k|    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
  ------------------
  |  |   18|  62.1k|  do {                                                \
  |  |   19|  62.1k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  62.1k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  62.1k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  62.1k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  62.1k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  62.1k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  62.1k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  62.1k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  62.1k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  62.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5536|  62.1k|                  buf1[3]);
 5537|       |
 5538|  62.1k|    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
  ------------------
  |  |   18|  62.1k|  do {                                                \
  |  |   19|  62.1k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  62.1k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  62.1k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  62.1k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  62.1k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  62.1k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  62.1k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  62.1k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  62.1k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  62.1k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5539|  62.1k|                  buf1[7]);
 5540|   433k|  } else {
 5541|   433k|    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
  ------------------
  |  |   18|   433k|  do {                                                \
  |  |   19|   433k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|   433k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|   433k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|   433k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|   433k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|   433k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|   433k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|   433k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|   433k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|   433k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5542|   433k|                  buf1[3]);
 5543|       |
 5544|   433k|    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
  ------------------
  |  |   18|   433k|  do {                                                \
  |  |   19|   433k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|   433k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|   433k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|   433k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|   433k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|   433k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|   433k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|   433k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|   433k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|   433k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5545|   433k|                  buf1[7]);
 5546|   433k|  }
 5547|       |
 5548|       |  // 2nd stage: column transform
 5549|   496k|  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|   496k|#define INV_COS_BIT 12
  ------------------
 5550|       |
 5551|   496k|  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
 5552|       |
 5553|       |  // write to buffer
 5554|   496k|  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
 5555|   496k|                                 bd);
 5556|   496k|}
highbd_inv_txfm_sse4.c:highbd_write_buffer_4xn_sse4_1:
  110|   925k|                                                  int height, const int bd) {
  111|   925k|  int j = flipud ? (height - 1) : 0;
  ------------------
  |  Branch (111:11): [True: 69.4k, False: 856k]
  ------------------
  112|   925k|  const int step = flipud ? -1 : 1;
  ------------------
  |  Branch (112:20): [True: 69.4k, False: 856k]
  ------------------
  113|  11.7M|  for (int i = 0; i < height; ++i, j += step) {
  ------------------
  |  Branch (113:19): [True: 10.8M, False: 925k]
  ------------------
  114|  10.8M|    __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
  115|  10.8M|    __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);
  116|       |
  117|  10.8M|    _mm_storel_epi64((__m128i *)(output + i * stride), u);
  118|  10.8M|  }
  119|   925k|}
highbd_inv_txfm_sse4.c:highbd_get_recon_4xn_sse4_1:
   99|  10.8M|                                                  __m128i res0, const int bd) {
  100|  10.8M|  __m128i x0 = _mm_cvtepi16_epi32(pred);
  101|       |
  102|  10.8M|  x0 = _mm_add_epi32(res0, x0);
  103|  10.8M|  x0 = _mm_packus_epi32(x0, x0);
  104|  10.8M|  x0 = highbd_clamp_epi16(x0, bd);
  105|  10.8M|  return x0;
  106|  10.8M|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_8x4_sse4_1:
 5770|   937k|                                               const TxfmParam *txfm_param) {
 5771|   937k|  int bd = txfm_param->bd;
 5772|   937k|  const TX_TYPE tx_type = txfm_param->tx_type;
 5773|   937k|  const TX_SIZE tx_size = txfm_param->tx_size;
 5774|   937k|  int eob = txfm_param->eob;
 5775|   937k|  highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
  ------------------
  |  |   75|   937k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5776|   937k|                                  tx_type, tx_size, eob, bd);
 5777|   937k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_8x4_sse41:
 5561|   937k|                                            int eob, const int bd) {
 5562|   937k|  (void)eob;
 5563|   937k|  __m128i buf1[8];
 5564|   937k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5565|   937k|  const int txw_idx = get_txw_idx(tx_size);
 5566|   937k|  const int txh_idx = get_txh_idx(tx_size);
 5567|   937k|  const int txfm_size_col = tx_size_wide[tx_size];
 5568|   937k|  const int txfm_size_row = tx_size_high[tx_size];
 5569|   937k|  const transform_1d_sse4_1 row_txfm =
 5570|   937k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
 5571|   937k|  const transform_1d_sse4_1 col_txfm =
 5572|   937k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 5573|       |
 5574|   937k|  assert(col_txfm != NULL);
 5575|   937k|  assert(row_txfm != NULL);
 5576|   937k|  int ud_flip, lr_flip;
 5577|   937k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5578|       |
 5579|       |  // 1st stage: column transform
 5580|   937k|  __m128i buf0[8];
 5581|   937k|  const int32_t *input_row = input;
 5582|   937k|  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
 5583|       |
 5584|   937k|  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_col, 0,
 5585|   937k|                                       NewInvSqrt2);
 5586|   937k|  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|   937k|#define INV_COS_BIT 12
  ------------------
 5587|       |
 5588|   937k|  __m128i *buf1_ptr;
 5589|   937k|  if (lr_flip) {
  ------------------
  |  Branch (5589:7): [True: 93.4k, False: 843k]
  ------------------
 5590|  93.4k|    flip_buf_sse2(buf0, buf1, txfm_size_col);
 5591|  93.4k|    buf1_ptr = buf1;
 5592|   843k|  } else {
 5593|   843k|    buf1_ptr = buf0;
 5594|   843k|  }
 5595|       |
 5596|       |  // 2nd stage: column transform
 5597|  2.81M|  for (int i = 0; i < 2; i++) {
  ------------------
  |  Branch (5597:19): [True: 1.87M, False: 937k]
  ------------------
 5598|  1.87M|    __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
 5599|  1.87M|    transpose_32bit_4x4(buf1_cur, buf1_cur);
 5600|  1.87M|    col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  1.87M|#define INV_COS_BIT 12
  ------------------
 5601|  1.87M|  }
 5602|   937k|  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
 5603|       |  // write to buffer
 5604|   937k|  highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
 5605|   937k|                                 txfm_size_row, bd);
 5606|   937k|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_4x4_sse4_1:
 5154|  2.37M|                                               const TxfmParam *txfm_param) {
 5155|  2.37M|  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
 5156|  2.37M|  int eob = txfm_param->eob;
 5157|  2.37M|  int bd = txfm_param->bd;
 5158|  2.37M|  int lossless = txfm_param->lossless;
 5159|  2.37M|  const int32_t *src = cast_to_int32(input);
 5160|  2.37M|  const TX_TYPE tx_type = txfm_param->tx_type;
 5161|  2.37M|  if (lossless) {
  ------------------
  |  Branch (5161:7): [True: 1.04M, False: 1.32M]
  ------------------
 5162|  1.04M|    assert(tx_type == DCT_DCT);
 5163|  1.04M|    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
 5164|  1.04M|    return;
 5165|  1.04M|  }
 5166|  1.32M|  av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
  ------------------
  |  |   75|  1.32M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5167|  1.32M|                                bd);
 5168|  1.32M|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_16x4_sse4_1:
 5792|   675k|                                                const TxfmParam *txfm_param) {
 5793|   675k|  int bd = txfm_param->bd;
 5794|   675k|  const TX_TYPE tx_type = txfm_param->tx_type;
 5795|   675k|  const TX_SIZE tx_size = txfm_param->tx_size;
 5796|   675k|  int eob = txfm_param->eob;
 5797|   675k|  highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
  ------------------
  |  |   75|   675k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5798|   675k|                                    tx_type, tx_size, eob, bd);
 5799|   675k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_16x4_sse4_1:
 5667|   675k|                                              int eob, const int bd) {
 5668|   675k|  (void)eob;
 5669|   675k|  __m128i buf1[16];
 5670|   675k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5671|   675k|  const int txw_idx = get_txw_idx(tx_size);
 5672|   675k|  const int txh_idx = get_txh_idx(tx_size);
 5673|   675k|  const int txfm_size_col = tx_size_wide[tx_size];
 5674|   675k|  const int txfm_size_row = tx_size_high[tx_size];
 5675|   675k|  const int buf_size_w_div8 = txfm_size_col >> 2;
 5676|   675k|  const transform_1d_sse4_1 row_txfm =
 5677|   675k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
 5678|   675k|  const transform_1d_sse4_1 col_txfm =
 5679|   675k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
 5680|       |
 5681|   675k|  assert(col_txfm != NULL);
 5682|   675k|  assert(row_txfm != NULL);
 5683|   675k|  int ud_flip, lr_flip;
 5684|   675k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5685|       |
 5686|       |  // 1st stage: column transform
 5687|   675k|  __m128i buf0[16];
 5688|   675k|  const int32_t *input_row = input;
 5689|   675k|  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
 5690|       |
 5691|   675k|  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|   675k|#define INV_COS_BIT 12
  ------------------
 5692|       |
 5693|   675k|  __m128i *buf1_ptr;
 5694|   675k|  if (lr_flip) {
  ------------------
  |  Branch (5694:7): [True: 47.7k, False: 627k]
  ------------------
 5695|  47.7k|    flip_buf_sse2(buf0, buf1, txfm_size_col);
 5696|  47.7k|    buf1_ptr = buf1;
 5697|   627k|  } else {
 5698|   627k|    buf1_ptr = buf0;
 5699|   627k|  }
 5700|       |
 5701|       |  // 2nd stage: column transform
 5702|  3.37M|  for (int i = 0; i < buf_size_w_div8; i++) {
  ------------------
  |  Branch (5702:19): [True: 2.69M, False: 675k]
  ------------------
 5703|  2.69M|    __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
 5704|  2.69M|    transpose_32bit_4x4(buf1_cur, buf1_cur);
 5705|  2.69M|    col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|  2.69M|#define INV_COS_BIT 12
  ------------------
 5706|  2.69M|  }
 5707|   675k|  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
 5708|       |
 5709|       |  // write to buffer
 5710|  2.02M|  for (int i = 0; i < (txfm_size_col >> 3); i++) {
  ------------------
  |  Branch (5710:19): [True: 1.35M, False: 675k]
  ------------------
 5711|  1.35M|    highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
 5712|  1.35M|                                   output + 8 * i, stride, ud_flip,
 5713|  1.35M|                                   txfm_size_row, bd);
 5714|  1.35M|  }
 5715|   675k|}
highbd_inv_txfm_sse4.c:av1_highbd_inv_txfm_add_4x16_sse4_1:
 5781|   429k|                                                const TxfmParam *txfm_param) {
 5782|   429k|  int bd = txfm_param->bd;
 5783|   429k|  const TX_TYPE tx_type = txfm_param->tx_type;
 5784|   429k|  const TX_SIZE tx_size = txfm_param->tx_size;
 5785|   429k|  int eob = txfm_param->eob;
 5786|   429k|  highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
  ------------------
  |  |   75|   429k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 5787|   429k|                                    tx_type, tx_size, eob, bd);
 5788|   429k|}
highbd_inv_txfm_sse4.c:highbd_inv_txfm2d_add_4x16_sse4_1:
 5611|   429k|                                              int eob, const int bd) {
 5612|   429k|  (void)eob;
 5613|   429k|  __m128i buf1[16];
 5614|   429k|  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
 5615|   429k|  const int txw_idx = get_txw_idx(tx_size);
 5616|   429k|  const int txh_idx = get_txh_idx(tx_size);
 5617|   429k|  const int txfm_size_col = tx_size_wide[tx_size];
 5618|   429k|  const int txfm_size_row = tx_size_high[tx_size];
 5619|   429k|  const int buf_size_h_div8 = txfm_size_row >> 2;
 5620|   429k|  const transform_1d_sse4_1 row_txfm =
 5621|   429k|      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
 5622|   429k|  const transform_1d_sse4_1 col_txfm =
 5623|   429k|      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
 5624|   429k|  const int input_stride = AOMMIN(32, txfm_size_row);
  ------------------
  |  |   34|   429k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 429k]
  |  |  ------------------
  ------------------
 5625|       |
 5626|   429k|  assert(col_txfm != NULL);
 5627|   429k|  assert(row_txfm != NULL);
 5628|   429k|  int ud_flip, lr_flip;
 5629|   429k|  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 5630|       |
 5631|       |  // 1st stage: column transform
 5632|   429k|  __m128i buf0[16];
 5633|  2.14M|  for (int i = 0; i < (txfm_size_row >> 2); i++) {
  ------------------
  |  Branch (5633:19): [True: 1.71M, False: 429k]
  ------------------
 5634|  1.71M|    const int32_t *input_row = input + i * 4;
 5635|  1.71M|    __m128i *buf0_cur = buf0 + i * 4;
 5636|  1.71M|    load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
 5637|  1.71M|    row_txfm(buf0_cur, buf0_cur, INV_COS_BIT, 0, bd, -shift[0]);
  ------------------
  |  |   43|  1.71M|#define INV_COS_BIT 12
  ------------------
 5638|  1.71M|  }
 5639|       |
 5640|   429k|  if (lr_flip) {
  ------------------
  |  Branch (5640:7): [True: 43.9k, False: 385k]
  ------------------
 5641|   219k|    for (int j = 0; j < buf_size_h_div8; ++j) {
  ------------------
  |  Branch (5641:21): [True: 175k, False: 43.9k]
  ------------------
 5642|   175k|      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
  ------------------
  |  |   18|   175k|  do {                                                \
  |  |   19|   175k|    __m128i u0, u1, u2, u3;                           \
  |  |   20|   175k|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|   175k|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|   175k|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|   175k|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|   175k|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|   175k|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|   175k|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|   175k|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|   175k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5643|   175k|                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
 5644|   175k|                    buf1[4 * j + 3]);
 5645|   175k|    }
 5646|   385k|  } else {
 5647|  1.92M|    for (int j = 0; j < buf_size_h_div8; ++j) {
  ------------------
  |  Branch (5647:21): [True: 1.54M, False: 385k]
  ------------------
 5648|  1.54M|      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
  ------------------
  |  |   18|  1.54M|  do {                                                \
  |  |   19|  1.54M|    __m128i u0, u1, u2, u3;                           \
  |  |   20|  1.54M|    u0 = _mm_unpacklo_epi32(x0, x1);                  \
  |  |   21|  1.54M|    u1 = _mm_unpackhi_epi32(x0, x1);                  \
  |  |   22|  1.54M|    u2 = _mm_unpacklo_epi32(x2, x3);                  \
  |  |   23|  1.54M|    u3 = _mm_unpackhi_epi32(x2, x3);                  \
  |  |   24|  1.54M|    y0 = _mm_unpacklo_epi64(u0, u2);                  \
  |  |   25|  1.54M|    y1 = _mm_unpackhi_epi64(u0, u2);                  \
  |  |   26|  1.54M|    y2 = _mm_unpacklo_epi64(u1, u3);                  \
  |  |   27|  1.54M|    y3 = _mm_unpackhi_epi64(u1, u3);                  \
  |  |   28|  1.54M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (28:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 5649|  1.54M|                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
 5650|  1.54M|                    buf1[4 * j + 2], buf1[4 * j + 3]);
 5651|  1.54M|    }
 5652|   385k|  }
 5653|       |
 5654|       |  // 2nd stage: column transform
 5655|   429k|  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
  ------------------
  |  |   43|   429k|#define INV_COS_BIT 12
  ------------------
 5656|       |
 5657|   429k|  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
 5658|       |
 5659|       |  // write to buffer
 5660|   429k|  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
 5661|   429k|                                 bd);
 5662|   429k|}

av1_highbd_dist_wtd_convolve_2d_copy_avx2:
   29|   312k|                                               int bd) {
   30|   312k|  CONV_BUF_TYPE *dst = conv_params->dst;
   31|   312k|  int dst_stride = conv_params->dst_stride;
   32|       |
   33|   312k|  const int bits =
   34|   312k|      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  ------------------
  |  |   21|   312k|#define FILTER_BITS 7
  ------------------
   35|   312k|  const __m128i left_shift = _mm_cvtsi32_si128(bits);
   36|   312k|  const int do_average = conv_params->do_average;
   37|   312k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   38|   312k|  const int w0 = conv_params->fwd_offset;
   39|   312k|  const int w1 = conv_params->bck_offset;
   40|   312k|  const __m256i wt0 = _mm256_set1_epi32(w0);
   41|   312k|  const __m256i wt1 = _mm256_set1_epi32(w1);
   42|   312k|  const __m256i zero = _mm256_setzero_si256();
   43|   312k|  int i, j;
   44|       |
   45|   312k|  const int offset_0 =
   46|   312k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   312k|#define FILTER_BITS 7
  ------------------
   47|   312k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
   48|   312k|  const __m256i offset_const = _mm256_set1_epi32(offset);
   49|   312k|  const __m256i offset_const_16b = _mm256_set1_epi16(offset);
   50|   312k|  const int rounding_shift =
   51|   312k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   312k|#define FILTER_BITS 7
  ------------------
   52|   312k|  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
   53|   312k|  const __m256i clip_pixel_to_bd =
   54|   312k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (54:25): [True: 310k, False: 1.87k]
  |  Branch (54:44): [True: 1.87k, False: 0]
  ------------------
   55|       |
   56|   312k|  assert(bits <= 4);
   57|       |
   58|   312k|  if (!(w % 16)) {
  ------------------
  |  Branch (58:7): [True: 113k, False: 199k]
  ------------------
   59|  2.55M|    for (i = 0; i < h; i += 1) {
  ------------------
  |  Branch (59:17): [True: 2.43M, False: 113k]
  ------------------
   60|  9.03M|      for (j = 0; j < w; j += 16) {
  ------------------
  |  Branch (60:19): [True: 6.59M, False: 2.43M]
  ------------------
   61|  6.59M|        const __m256i src_16bit =
   62|  6.59M|            _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
   63|       |
   64|  6.59M|        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
   65|       |
   66|  6.59M|        if (do_average) {
  ------------------
  |  Branch (66:13): [True: 2.60M, False: 3.98M]
  ------------------
   67|  2.60M|          const __m256i data_0 =
   68|  2.60M|              _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
   69|       |
   70|  2.60M|          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
   71|  2.60M|          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
   72|       |
   73|  2.60M|          const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
   74|  2.60M|          const __m256i res_unsigned_lo =
   75|  2.60M|              _mm256_add_epi32(res_32b_lo, offset_const);
   76|       |
   77|  2.60M|          const __m256i comp_avg_res_lo =
   78|  2.60M|              highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
   79|  2.60M|                              use_dist_wtd_comp_avg);
   80|       |
   81|  2.60M|          const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
   82|  2.60M|          const __m256i res_unsigned_hi =
   83|  2.60M|              _mm256_add_epi32(res_32b_hi, offset_const);
   84|       |
   85|  2.60M|          const __m256i comp_avg_res_hi =
   86|  2.60M|              highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
   87|  2.60M|                              use_dist_wtd_comp_avg);
   88|       |
   89|  2.60M|          const __m256i round_result_lo = highbd_convolve_rounding(
   90|  2.60M|              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
   91|  2.60M|          const __m256i round_result_hi = highbd_convolve_rounding(
   92|  2.60M|              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
   93|       |
   94|  2.60M|          const __m256i res_16b =
   95|  2.60M|              _mm256_packus_epi32(round_result_lo, round_result_hi);
   96|  2.60M|          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
   97|       |
   98|  2.60M|          _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
   99|  3.98M|        } else {
  100|  3.98M|          const __m256i res_unsigned_16b =
  101|  3.98M|              _mm256_adds_epu16(res, offset_const_16b);
  102|       |
  103|  3.98M|          _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
  104|  3.98M|                             res_unsigned_16b);
  105|  3.98M|        }
  106|  6.59M|      }
  107|  2.43M|    }
  108|   199k|  } else if (!(w % 4)) {
  ------------------
  |  Branch (108:14): [True: 199k, False: 18.4E]
  ------------------
  109|   968k|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (109:17): [True: 769k, False: 199k]
  ------------------
  110|  1.53M|      for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (110:19): [True: 769k, False: 769k]
  ------------------
  111|   769k|        const __m128i src_row_0 =
  112|   769k|            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
  113|   769k|        const __m128i src_row_1 =
  114|   769k|            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
  115|       |        // since not all compilers yet support _mm256_set_m128i()
  116|   769k|        const __m256i src_10 = _mm256_insertf128_si256(
  117|   769k|            _mm256_castsi128_si256(src_row_0), src_row_1, 1);
  118|       |
  119|   769k|        const __m256i res = _mm256_sll_epi16(src_10, left_shift);
  120|       |
  121|   769k|        if (w - j < 8) {
  ------------------
  |  Branch (121:13): [True: 244k, False: 524k]
  ------------------
  122|   244k|          if (do_average) {
  ------------------
  |  Branch (122:15): [True: 80.2k, False: 164k]
  ------------------
  123|  80.2k|            const __m256i data_0 = _mm256_castsi128_si256(
  124|  80.2k|                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
  125|  80.2k|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
  126|  80.2k|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  127|  80.2k|            const __m256i data_01 =
  128|  80.2k|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  129|       |
  130|  80.2k|            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
  131|       |
  132|  80.2k|            const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
  133|  80.2k|            const __m256i res_unsigned_lo =
  134|  80.2k|                _mm256_add_epi32(res_32b, offset_const);
  135|       |
  136|  80.2k|            const __m256i comp_avg_res =
  137|  80.2k|                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
  138|  80.2k|                                use_dist_wtd_comp_avg);
  139|       |
  140|  80.2k|            const __m256i round_result = highbd_convolve_rounding(
  141|  80.2k|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  142|       |
  143|  80.2k|            const __m256i res_16b =
  144|  80.2k|                _mm256_packus_epi32(round_result, round_result);
  145|  80.2k|            const __m256i res_clip =
  146|  80.2k|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  147|       |
  148|  80.2k|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  149|  80.2k|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  150|       |
  151|  80.2k|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  152|  80.2k|            _mm_storel_epi64(
  153|  80.2k|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  154|   164k|          } else {
  155|   164k|            const __m256i res_unsigned_16b =
  156|   164k|                _mm256_adds_epu16(res, offset_const_16b);
  157|       |
  158|   164k|            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
  159|   164k|            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
  160|       |
  161|   164k|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
  162|   164k|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  163|   164k|                             res_1);
  164|   164k|          }
  165|   524k|        } else {
  166|   524k|          if (do_average) {
  ------------------
  |  Branch (166:15): [True: 171k, False: 352k]
  ------------------
  167|   171k|            const __m256i data_0 = _mm256_castsi128_si256(
  168|   171k|                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
  169|   171k|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
  170|   171k|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  171|   171k|            const __m256i data_01 =
  172|   171k|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  173|       |
  174|   171k|            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
  175|   171k|            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
  176|       |
  177|   171k|            const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
  178|   171k|            const __m256i res_unsigned_lo =
  179|   171k|                _mm256_add_epi32(res_32b_lo, offset_const);
  180|       |
  181|   171k|            const __m256i comp_avg_res_lo =
  182|   171k|                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
  183|   171k|                                use_dist_wtd_comp_avg);
  184|       |
  185|   171k|            const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
  186|   171k|            const __m256i res_unsigned_hi =
  187|   171k|                _mm256_add_epi32(res_32b_hi, offset_const);
  188|       |
  189|   171k|            const __m256i comp_avg_res_hi =
  190|   171k|                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
  191|   171k|                                use_dist_wtd_comp_avg);
  192|       |
  193|   171k|            const __m256i round_result_lo =
  194|   171k|                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
  195|   171k|                                         &rounding_const, rounding_shift);
  196|   171k|            const __m256i round_result_hi =
  197|   171k|                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
  198|   171k|                                         &rounding_const, rounding_shift);
  199|       |
  200|   171k|            const __m256i res_16b =
  201|   171k|                _mm256_packus_epi32(round_result_lo, round_result_hi);
  202|   171k|            const __m256i res_clip =
  203|   171k|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  204|       |
  205|   171k|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  206|   171k|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  207|       |
  208|   171k|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  209|   171k|            _mm_store_si128(
  210|   171k|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  211|   352k|          } else {
  212|   352k|            const __m256i res_unsigned_16b =
  213|   352k|                _mm256_adds_epu16(res, offset_const_16b);
  214|   352k|            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
  215|   352k|            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
  216|       |
  217|   352k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  218|   352k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  219|   352k|                            res_1);
  220|   352k|          }
  221|   524k|        }
  222|   769k|      }
  223|   769k|    }
  224|   199k|  }
  225|   312k|}
av1_highbd_dist_wtd_convolve_2d_avx2:
  231|   764k|    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  232|   764k|  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  ------------------
  |  |   19|   764k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  233|   764k|  CONV_BUF_TYPE *dst = conv_params->dst;
  234|   764k|  int dst_stride = conv_params->dst_stride;
  235|   764k|  int im_h = h + filter_params_y->taps - 1;
  236|   764k|  int im_stride = 8;
  237|   764k|  int i, j;
  238|   764k|  const int fo_vert = filter_params_y->taps / 2 - 1;
  239|   764k|  const int fo_horiz = filter_params_x->taps / 2 - 1;
  240|   764k|  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  241|       |
  242|       |  // Check that, even with 12-bit input, the intermediate values will fit
  243|       |  // into an unsigned 16-bit intermediate array.
  244|   764k|  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
  245|       |
  246|   764k|  __m256i s[8], coeffs_y[4], coeffs_x[4];
  247|   764k|  const int do_average = conv_params->do_average;
  248|   764k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  249|       |
  250|   764k|  const int w0 = conv_params->fwd_offset;
  251|   764k|  const int w1 = conv_params->bck_offset;
  252|   764k|  const __m256i wt0 = _mm256_set1_epi32(w0);
  253|   764k|  const __m256i wt1 = _mm256_set1_epi32(w1);
  254|   764k|  const __m256i zero = _mm256_setzero_si256();
  255|       |
  256|   764k|  const __m256i round_const_x = _mm256_set1_epi32(
  257|   764k|      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  ------------------
  |  |   21|   764k|#define FILTER_BITS 7
  ------------------
  258|   764k|  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
  259|       |
  260|   764k|  const __m256i round_const_y = _mm256_set1_epi32(
  261|   764k|      ((1 << conv_params->round_1) >> 1) -
  262|   764k|      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  ------------------
  |  |   21|   764k|#define FILTER_BITS 7
  ------------------
  263|   764k|  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
  264|       |
  265|   764k|  const int offset_0 =
  266|   764k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   764k|#define FILTER_BITS 7
  ------------------
  267|   764k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  268|   764k|  const __m256i offset_const = _mm256_set1_epi32(offset);
  269|   764k|  const int rounding_shift =
  270|   764k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   764k|#define FILTER_BITS 7
  ------------------
  271|   764k|  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  272|       |
  273|   764k|  const __m256i clip_pixel_to_bd =
  274|   764k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (274:25): [True: 762k, False: 1.55k]
  |  Branch (274:44): [True: 1.55k, False: 1]
  ------------------
  275|       |
  276|   764k|  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
  277|   764k|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
  278|       |
  279|  2.06M|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (279:15): [True: 1.30M, False: 764k]
  ------------------
  280|       |    /* Horizontal filter */
  281|  1.30M|    {
  282|  18.9M|      for (i = 0; i < im_h; i += 2) {
  ------------------
  |  Branch (282:19): [True: 17.6M, False: 1.30M]
  ------------------
  283|  17.6M|        const __m256i row0 =
  284|  17.6M|            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
  285|  17.6M|        __m256i row1 = _mm256_setzero_si256();
  286|  17.6M|        if (i + 1 < im_h)
  ------------------
  |  Branch (286:13): [True: 16.3M, False: 1.33M]
  ------------------
  287|  16.3M|          row1 =
  288|  16.3M|              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
  289|       |
  290|  17.6M|        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
  291|  17.6M|        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
  292|       |
  293|       |        // even pixels
  294|  17.6M|        s[0] = _mm256_alignr_epi8(r1, r0, 0);
  295|  17.6M|        s[1] = _mm256_alignr_epi8(r1, r0, 4);
  296|  17.6M|        s[2] = _mm256_alignr_epi8(r1, r0, 8);
  297|  17.6M|        s[3] = _mm256_alignr_epi8(r1, r0, 12);
  298|       |
  299|  17.6M|        __m256i res_even = convolve(s, coeffs_x);
  300|  17.6M|        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
  301|  17.6M|                                    round_shift_x);
  302|       |
  303|       |        // odd pixels
  304|  17.6M|        s[0] = _mm256_alignr_epi8(r1, r0, 2);
  305|  17.6M|        s[1] = _mm256_alignr_epi8(r1, r0, 6);
  306|  17.6M|        s[2] = _mm256_alignr_epi8(r1, r0, 10);
  307|  17.6M|        s[3] = _mm256_alignr_epi8(r1, r0, 14);
  308|       |
  309|  17.6M|        __m256i res_odd = convolve(s, coeffs_x);
  310|  17.6M|        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
  311|  17.6M|                                   round_shift_x);
  312|       |
  313|  17.6M|        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
  314|  17.6M|        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
  315|  17.6M|        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
  316|       |
  317|  17.6M|        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
  318|  17.6M|      }
  319|  1.30M|    }
  320|       |
  321|       |    /* Vertical filter */
  322|  1.30M|    {
  323|  1.30M|      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
  324|  1.30M|      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
  325|  1.30M|      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
  326|  1.30M|      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
  327|  1.30M|      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
  328|  1.30M|      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
  329|       |
  330|  1.30M|      s[0] = _mm256_unpacklo_epi16(s0, s1);
  331|  1.30M|      s[1] = _mm256_unpacklo_epi16(s2, s3);
  332|  1.30M|      s[2] = _mm256_unpacklo_epi16(s4, s5);
  333|       |
  334|  1.30M|      s[4] = _mm256_unpackhi_epi16(s0, s1);
  335|  1.30M|      s[5] = _mm256_unpackhi_epi16(s2, s3);
  336|  1.30M|      s[6] = _mm256_unpackhi_epi16(s4, s5);
  337|       |
  338|  12.3M|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (338:19): [True: 11.0M, False: 1.30M]
  ------------------
  339|  11.0M|        const int16_t *data = &im_block[i * im_stride];
  340|       |
  341|  11.0M|        const __m256i s6 =
  342|  11.0M|            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
  343|  11.0M|        const __m256i s7 =
  344|  11.0M|            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
  345|       |
  346|  11.0M|        s[3] = _mm256_unpacklo_epi16(s6, s7);
  347|  11.0M|        s[7] = _mm256_unpackhi_epi16(s6, s7);
  348|       |
  349|  11.0M|        const __m256i res_a = convolve(s, coeffs_y);
  350|       |
  351|  11.0M|        const __m256i res_a_round = _mm256_sra_epi32(
  352|  11.0M|            _mm256_add_epi32(res_a, round_const_y), round_shift_y);
  353|       |
  354|  11.0M|        const __m256i res_unsigned_lo =
  355|  11.0M|            _mm256_add_epi32(res_a_round, offset_const);
  356|       |
  357|  11.0M|        if (w - j < 8) {
  ------------------
  |  Branch (357:13): [True: 767k, False: 10.3M]
  ------------------
  358|   767k|          if (do_average) {
  ------------------
  |  Branch (358:15): [True: 279k, False: 487k]
  ------------------
  359|   279k|            const __m256i data_0 = _mm256_castsi128_si256(
  360|   279k|                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
  361|   279k|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
  362|   279k|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  363|   279k|            const __m256i data_01 =
  364|   279k|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  365|       |
  366|   279k|            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
  367|       |
  368|   279k|            const __m256i comp_avg_res =
  369|   279k|                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
  370|   279k|                                use_dist_wtd_comp_avg);
  371|       |
  372|   279k|            const __m256i round_result = highbd_convolve_rounding(
  373|   279k|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  374|       |
  375|   279k|            const __m256i res_16b =
  376|   279k|                _mm256_packus_epi32(round_result, round_result);
  377|   279k|            const __m256i res_clip =
  378|   279k|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  379|       |
  380|   279k|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  381|   279k|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  382|       |
  383|   279k|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  384|   279k|            _mm_storel_epi64(
  385|   279k|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  386|   487k|          } else {
  387|   487k|            __m256i res_16b =
  388|   487k|                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
  389|   487k|            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  390|   487k|            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  391|       |
  392|   487k|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
  393|   487k|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  394|   487k|                             res_1);
  395|   487k|          }
  396|  10.3M|        } else {
  397|  10.3M|          const __m256i res_b = convolve(s + 4, coeffs_y);
  398|  10.3M|          const __m256i res_b_round = _mm256_sra_epi32(
  399|  10.3M|              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
  400|       |
  401|  10.3M|          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
  402|       |
  403|  10.3M|          if (do_average) {
  ------------------
  |  Branch (403:15): [True: 3.60M, False: 6.72M]
  ------------------
  404|  3.60M|            const __m256i data_0 = _mm256_castsi128_si256(
  405|  3.60M|                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
  406|  3.60M|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
  407|  3.60M|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  408|  3.60M|            const __m256i data_01 =
  409|  3.60M|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  410|       |
  411|  3.60M|            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
  412|  3.60M|            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
  413|       |
  414|  3.60M|            const __m256i comp_avg_res_lo =
  415|  3.60M|                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
  416|  3.60M|                                use_dist_wtd_comp_avg);
  417|  3.60M|            const __m256i comp_avg_res_hi =
  418|  3.60M|                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
  419|  3.60M|                                use_dist_wtd_comp_avg);
  420|       |
  421|  3.60M|            const __m256i round_result_lo =
  422|  3.60M|                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
  423|  3.60M|                                         &rounding_const, rounding_shift);
  424|  3.60M|            const __m256i round_result_hi =
  425|  3.60M|                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
  426|  3.60M|                                         &rounding_const, rounding_shift);
  427|       |
  428|  3.60M|            const __m256i res_16b =
  429|  3.60M|                _mm256_packus_epi32(round_result_lo, round_result_hi);
  430|  3.60M|            const __m256i res_clip =
  431|  3.60M|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  432|       |
  433|  3.60M|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  434|  3.60M|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  435|       |
  436|  3.60M|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  437|  3.60M|            _mm_store_si128(
  438|  3.60M|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  439|  6.72M|          } else {
  440|  6.72M|            __m256i res_16b =
  441|  6.72M|                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
  442|  6.72M|            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  443|  6.72M|            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  444|       |
  445|  6.72M|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  446|  6.72M|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  447|  6.72M|                            res_1);
  448|  6.72M|          }
  449|  10.3M|        }
  450|       |
  451|  11.0M|        s[0] = s[1];
  452|  11.0M|        s[1] = s[2];
  453|  11.0M|        s[2] = s[3];
  454|       |
  455|  11.0M|        s[4] = s[5];
  456|  11.0M|        s[5] = s[6];
  457|  11.0M|        s[6] = s[7];
  458|  11.0M|      }
  459|  1.30M|    }
  460|  1.30M|  }
  461|   764k|}
av1_highbd_dist_wtd_convolve_x_avx2:
  466|   282k|    ConvolveParams *conv_params, int bd) {
  467|   282k|  CONV_BUF_TYPE *dst = conv_params->dst;
  468|   282k|  int dst_stride = conv_params->dst_stride;
  469|   282k|  const int fo_horiz = filter_params_x->taps / 2 - 1;
  470|   282k|  const uint16_t *const src_ptr = src - fo_horiz;
  471|   282k|  const int bits = FILTER_BITS - conv_params->round_1;
  ------------------
  |  |   21|   282k|#define FILTER_BITS 7
  ------------------
  472|       |
  473|   282k|  int i, j;
  474|   282k|  __m256i s[4], coeffs_x[4];
  475|       |
  476|   282k|  const int do_average = conv_params->do_average;
  477|   282k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  478|   282k|  const int w0 = conv_params->fwd_offset;
  479|   282k|  const int w1 = conv_params->bck_offset;
  480|   282k|  const __m256i wt0 = _mm256_set1_epi32(w0);
  481|   282k|  const __m256i wt1 = _mm256_set1_epi32(w1);
  482|   282k|  const __m256i zero = _mm256_setzero_si256();
  483|       |
  484|   282k|  const __m256i round_const_x =
  485|   282k|      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  486|   282k|  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
  487|   282k|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  488|       |
  489|   282k|  const int offset_0 =
  490|   282k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   282k|#define FILTER_BITS 7
  ------------------
  491|   282k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  492|   282k|  const __m256i offset_const = _mm256_set1_epi32(offset);
  493|   282k|  const int rounding_shift =
  494|   282k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   282k|#define FILTER_BITS 7
  ------------------
  495|   282k|  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  496|   282k|  const __m256i clip_pixel_to_bd =
  497|   282k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (497:25): [True: 281k, False: 720]
  |  Branch (497:44): [True: 718, False: 2]
  ------------------
  498|       |
  499|   282k|  assert(bits >= 0);
  500|   282k|  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
  501|       |
  502|   986k|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (502:15): [True: 704k, False: 282k]
  ------------------
  503|       |    /* Horizontal filter */
  504|  14.0M|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (504:17): [True: 13.3M, False: 704k]
  ------------------
  505|  13.3M|      const __m256i row0 =
  506|  13.3M|          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
  507|  13.3M|      __m256i row1 =
  508|  13.3M|          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
  509|       |
  510|  13.3M|      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
  511|  13.3M|      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
  512|       |
  513|       |      // even pixels
  514|  13.3M|      s[0] = _mm256_alignr_epi8(r1, r0, 0);
  515|  13.3M|      s[1] = _mm256_alignr_epi8(r1, r0, 4);
  516|  13.3M|      s[2] = _mm256_alignr_epi8(r1, r0, 8);
  517|  13.3M|      s[3] = _mm256_alignr_epi8(r1, r0, 12);
  518|       |
  519|  13.3M|      __m256i res_even = convolve(s, coeffs_x);
  520|  13.3M|      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
  521|  13.3M|                                  round_shift_x);
  522|       |
  523|       |      // odd pixels
  524|  13.3M|      s[0] = _mm256_alignr_epi8(r1, r0, 2);
  525|  13.3M|      s[1] = _mm256_alignr_epi8(r1, r0, 6);
  526|  13.3M|      s[2] = _mm256_alignr_epi8(r1, r0, 10);
  527|  13.3M|      s[3] = _mm256_alignr_epi8(r1, r0, 14);
  528|       |
  529|  13.3M|      __m256i res_odd = convolve(s, coeffs_x);
  530|  13.3M|      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
  531|  13.3M|                                 round_shift_x);
  532|       |
  533|  13.3M|      res_even = _mm256_sll_epi32(res_even, round_shift_bits);
  534|  13.3M|      res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
  535|       |
  536|  13.3M|      __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);
  537|       |
  538|  13.3M|      __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);
  539|       |
  540|  13.3M|      if (w - j < 8) {
  ------------------
  |  Branch (540:11): [True: 220k, False: 13.1M]
  ------------------
  541|   220k|        if (do_average) {
  ------------------
  |  Branch (541:13): [True: 46.3k, False: 174k]
  ------------------
  542|  46.3k|          const __m256i data_0 = _mm256_castsi128_si256(
  543|  46.3k|              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
  544|  46.3k|          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
  545|  46.3k|              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  546|  46.3k|          const __m256i data_01 =
  547|  46.3k|              _mm256_permute2x128_si256(data_0, data_1, 0x20);
  548|       |
  549|  46.3k|          const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
  550|       |
  551|  46.3k|          const __m256i comp_avg_res = highbd_comp_avg(
  552|  46.3k|              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
  553|       |
  554|  46.3k|          const __m256i round_result = highbd_convolve_rounding(
  555|  46.3k|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  556|       |
  557|  46.3k|          const __m256i res_16b =
  558|  46.3k|              _mm256_packus_epi32(round_result, round_result);
  559|  46.3k|          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  560|       |
  561|  46.3k|          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  562|  46.3k|          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  563|       |
  564|  46.3k|          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  565|  46.3k|          _mm_storel_epi64(
  566|  46.3k|              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  567|   174k|        } else {
  568|   174k|          __m256i res_16b =
  569|   174k|              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
  570|   174k|          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  571|   174k|          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  572|       |
  573|   174k|          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
  574|   174k|          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  575|   174k|                           res_1);
  576|   174k|        }
  577|  13.1M|      } else {
  578|  13.1M|        __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
  579|  13.1M|        __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);
  580|       |
  581|  13.1M|        if (do_average) {
  ------------------
  |  Branch (581:13): [True: 5.26M, False: 7.90M]
  ------------------
  582|  5.26M|          const __m256i data_0 = _mm256_castsi128_si256(
  583|  5.26M|              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
  584|  5.26M|          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
  585|  5.26M|              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  586|  5.26M|          const __m256i data_01 =
  587|  5.26M|              _mm256_permute2x128_si256(data_0, data_1, 0x20);
  588|       |
  589|  5.26M|          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
  590|  5.26M|          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
  591|       |
  592|  5.26M|          const __m256i comp_avg_res_lo =
  593|  5.26M|              highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
  594|  5.26M|                              use_dist_wtd_comp_avg);
  595|  5.26M|          const __m256i comp_avg_res_hi =
  596|  5.26M|              highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
  597|  5.26M|                              use_dist_wtd_comp_avg);
  598|       |
  599|  5.26M|          const __m256i round_result_lo = highbd_convolve_rounding(
  600|  5.26M|              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
  601|  5.26M|          const __m256i round_result_hi = highbd_convolve_rounding(
  602|  5.26M|              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
  603|       |
  604|  5.26M|          const __m256i res_16b =
  605|  5.26M|              _mm256_packus_epi32(round_result_lo, round_result_hi);
  606|  5.26M|          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  607|       |
  608|  5.26M|          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  609|  5.26M|          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  610|       |
  611|  5.26M|          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  612|  5.26M|          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
  613|  5.26M|                          res_1);
  614|  7.90M|        } else {
  615|  7.90M|          __m256i res_16b =
  616|  7.90M|              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
  617|  7.90M|          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  618|  7.90M|          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  619|       |
  620|  7.90M|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  621|  7.90M|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  622|  7.90M|                          res_1);
  623|  7.90M|        }
  624|  13.1M|      }
  625|  13.3M|    }
  626|   704k|  }
  627|   282k|}
av1_highbd_dist_wtd_convolve_y_avx2:
  632|   154k|    ConvolveParams *conv_params, int bd) {
  633|   154k|  CONV_BUF_TYPE *dst = conv_params->dst;
  634|   154k|  int dst_stride = conv_params->dst_stride;
  635|   154k|  const int fo_vert = filter_params_y->taps / 2 - 1;
  636|   154k|  const uint16_t *const src_ptr = src - fo_vert * src_stride;
  637|   154k|  const int bits = FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|   154k|#define FILTER_BITS 7
  ------------------
  638|       |
  639|   154k|  assert(bits >= 0);
  640|   154k|  int i, j;
  641|   154k|  __m256i s[8], coeffs_y[4];
  642|   154k|  const int do_average = conv_params->do_average;
  643|   154k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  644|       |
  645|   154k|  const int w0 = conv_params->fwd_offset;
  646|   154k|  const int w1 = conv_params->bck_offset;
  647|   154k|  const __m256i wt0 = _mm256_set1_epi32(w0);
  648|   154k|  const __m256i wt1 = _mm256_set1_epi32(w1);
  649|   154k|  const __m256i round_const_y =
  650|   154k|      _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
  651|   154k|  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
  652|   154k|  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  653|       |
  654|   154k|  const int offset_0 =
  655|   154k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   154k|#define FILTER_BITS 7
  ------------------
  656|   154k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  657|   154k|  const __m256i offset_const = _mm256_set1_epi32(offset);
  658|   154k|  const int rounding_shift =
  659|   154k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   154k|#define FILTER_BITS 7
  ------------------
  660|   154k|  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  661|   154k|  const __m256i clip_pixel_to_bd =
  662|   154k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (662:25): [True: 153k, False: 837]
  |  Branch (662:44): [True: 836, False: 1]
  ------------------
  663|   154k|  const __m256i zero = _mm256_setzero_si256();
  664|       |
  665|   154k|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
  666|       |
  667|   462k|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (667:15): [True: 307k, False: 154k]
  ------------------
  668|   307k|    const uint16_t *data = &src_ptr[j];
  669|       |    /* Vertical filter */
  670|   307k|    {
  671|   307k|      __m256i src6;
  672|   307k|      __m256i s01 = _mm256_permute2x128_si256(
  673|   307k|          _mm256_castsi128_si256(
  674|   307k|              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
  675|   307k|          _mm256_castsi128_si256(
  676|   307k|              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
  677|   307k|          0x20);
  678|   307k|      __m256i s12 = _mm256_permute2x128_si256(
  679|   307k|          _mm256_castsi128_si256(
  680|   307k|              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
  681|   307k|          _mm256_castsi128_si256(
  682|   307k|              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
  683|   307k|          0x20);
  684|   307k|      __m256i s23 = _mm256_permute2x128_si256(
  685|   307k|          _mm256_castsi128_si256(
  686|   307k|              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
  687|   307k|          _mm256_castsi128_si256(
  688|   307k|              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
  689|   307k|          0x20);
  690|   307k|      __m256i s34 = _mm256_permute2x128_si256(
  691|   307k|          _mm256_castsi128_si256(
  692|   307k|              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
  693|   307k|          _mm256_castsi128_si256(
  694|   307k|              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
  695|   307k|          0x20);
  696|   307k|      __m256i s45 = _mm256_permute2x128_si256(
  697|   307k|          _mm256_castsi128_si256(
  698|   307k|              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
  699|   307k|          _mm256_castsi128_si256(
  700|   307k|              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
  701|   307k|          0x20);
  702|   307k|      src6 = _mm256_castsi128_si256(
  703|   307k|          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
  704|   307k|      __m256i s56 = _mm256_permute2x128_si256(
  705|   307k|          _mm256_castsi128_si256(
  706|   307k|              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
  707|   307k|          src6, 0x20);
  708|       |
  709|   307k|      s[0] = _mm256_unpacklo_epi16(s01, s12);
  710|   307k|      s[1] = _mm256_unpacklo_epi16(s23, s34);
  711|   307k|      s[2] = _mm256_unpacklo_epi16(s45, s56);
  712|       |
  713|   307k|      s[4] = _mm256_unpackhi_epi16(s01, s12);
  714|   307k|      s[5] = _mm256_unpackhi_epi16(s23, s34);
  715|   307k|      s[6] = _mm256_unpackhi_epi16(s45, s56);
  716|       |
  717|  3.65M|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (717:19): [True: 3.35M, False: 307k]
  ------------------
  718|  3.35M|        data = &src_ptr[i * src_stride + j];
  719|       |
  720|  3.35M|        const __m256i s67 = _mm256_permute2x128_si256(
  721|  3.35M|            src6,
  722|  3.35M|            _mm256_castsi128_si256(
  723|  3.35M|                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
  724|  3.35M|            0x20);
  725|       |
  726|  3.35M|        src6 = _mm256_castsi128_si256(
  727|  3.35M|            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
  728|       |
  729|  3.35M|        const __m256i s78 = _mm256_permute2x128_si256(
  730|  3.35M|            _mm256_castsi128_si256(
  731|  3.35M|                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
  732|  3.35M|            src6, 0x20);
  733|       |
  734|  3.35M|        s[3] = _mm256_unpacklo_epi16(s67, s78);
  735|  3.35M|        s[7] = _mm256_unpackhi_epi16(s67, s78);
  736|       |
  737|  3.35M|        const __m256i res_a = convolve(s, coeffs_y);
  738|       |
  739|  3.35M|        __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
  740|  3.35M|        res_a_round = _mm256_sra_epi32(
  741|  3.35M|            _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
  742|       |
  743|  3.35M|        __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);
  744|       |
  745|  3.35M|        if (w - j < 8) {
  ------------------
  |  Branch (745:13): [True: 129k, False: 3.22M]
  ------------------
  746|   129k|          if (do_average) {
  ------------------
  |  Branch (746:15): [True: 70.8k, False: 58.7k]
  ------------------
  747|  70.8k|            const __m256i data_0 = _mm256_castsi128_si256(
  748|  70.8k|                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
  749|  70.8k|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
  750|  70.8k|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  751|  70.8k|            const __m256i data_01 =
  752|  70.8k|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  753|       |
  754|  70.8k|            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
  755|       |
  756|  70.8k|            const __m256i comp_avg_res =
  757|  70.8k|                highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
  758|  70.8k|                                use_dist_wtd_comp_avg);
  759|       |
  760|  70.8k|            const __m256i round_result = highbd_convolve_rounding(
  761|  70.8k|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  762|       |
  763|  70.8k|            const __m256i res_16b =
  764|  70.8k|                _mm256_packus_epi32(round_result, round_result);
  765|  70.8k|            const __m256i res_clip =
  766|  70.8k|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  767|       |
  768|  70.8k|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  769|  70.8k|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  770|       |
  771|  70.8k|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  772|  70.8k|            _mm_storel_epi64(
  773|  70.8k|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  774|  70.8k|          } else {
  775|  58.7k|            __m256i res_16b =
  776|  58.7k|                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
  777|  58.7k|            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  778|  58.7k|            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  779|       |
  780|  58.7k|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
  781|  58.7k|            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  782|  58.7k|                             res_1);
  783|  58.7k|          }
  784|  3.22M|        } else {
  785|  3.22M|          const __m256i res_b = convolve(s + 4, coeffs_y);
  786|  3.22M|          __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
  787|  3.22M|          res_b_round = _mm256_sra_epi32(
  788|  3.22M|              _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
  789|       |
  790|  3.22M|          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
  791|       |
  792|  3.22M|          if (do_average) {
  ------------------
  |  Branch (792:15): [True: 1.05M, False: 2.16M]
  ------------------
  793|  1.05M|            const __m256i data_0 = _mm256_castsi128_si256(
  794|  1.05M|                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
  795|  1.05M|            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
  796|  1.05M|                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
  797|  1.05M|            const __m256i data_01 =
  798|  1.05M|                _mm256_permute2x128_si256(data_0, data_1, 0x20);
  799|       |
  800|  1.05M|            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
  801|  1.05M|            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
  802|       |
  803|  1.05M|            const __m256i comp_avg_res_lo =
  804|  1.05M|                highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
  805|  1.05M|                                use_dist_wtd_comp_avg);
  806|  1.05M|            const __m256i comp_avg_res_hi =
  807|  1.05M|                highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
  808|  1.05M|                                use_dist_wtd_comp_avg);
  809|       |
  810|  1.05M|            const __m256i round_result_lo =
  811|  1.05M|                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
  812|  1.05M|                                         &rounding_const, rounding_shift);
  813|  1.05M|            const __m256i round_result_hi =
  814|  1.05M|                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
  815|  1.05M|                                         &rounding_const, rounding_shift);
  816|       |
  817|  1.05M|            const __m256i res_16b =
  818|  1.05M|                _mm256_packus_epi32(round_result_lo, round_result_hi);
  819|  1.05M|            const __m256i res_clip =
  820|  1.05M|                _mm256_min_epi16(res_16b, clip_pixel_to_bd);
  821|       |
  822|  1.05M|            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
  823|  1.05M|            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
  824|       |
  825|  1.05M|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  826|  1.05M|            _mm_store_si128(
  827|  1.05M|                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
  828|  2.16M|          } else {
  829|  2.16M|            __m256i res_16b =
  830|  2.16M|                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
  831|  2.16M|            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
  832|  2.16M|            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
  833|       |
  834|  2.16M|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  835|  2.16M|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  836|  2.16M|                            res_1);
  837|  2.16M|          }
  838|  3.22M|        }
  839|  3.35M|        s[0] = s[1];
  840|  3.35M|        s[1] = s[2];
  841|  3.35M|        s[2] = s[3];
  842|       |
  843|  3.35M|        s[4] = s[5];
  844|  3.35M|        s[5] = s[6];
  845|  3.35M|        s[6] = s[7];
  846|  3.35M|      }
  847|   307k|    }
  848|   307k|  }
  849|   154k|}

highbd_inv_txfm_sse4.c:half_btf_0_sse4_1:
  112|   498k|                                        const __m128i *rounding, int bit) {
  113|   498k|  __m128i x;
  114|       |
  115|   498k|  x = _mm_mullo_epi32(*w0, *n0);
  116|   498k|  x = _mm_add_epi32(x, *rounding);
  117|   498k|  x = _mm_srai_epi32(x, bit);
  118|   498k|  return x;
  119|   498k|}
highbd_inv_txfm_sse4.c:half_btf_sse4_1:
  100|  12.1M|                                      const __m128i *rounding, int bit) {
  101|  12.1M|  __m128i x, y;
  102|       |
  103|  12.1M|  x = _mm_mullo_epi32(*w0, *n0);
  104|  12.1M|  y = _mm_mullo_epi32(*w1, *n1);
  105|  12.1M|  x = _mm_add_epi32(x, y);
  106|  12.1M|  x = _mm_add_epi32(x, *rounding);
  107|  12.1M|  x = _mm_srai_epi32(x, bit);
  108|  12.1M|  return x;
  109|  12.1M|}

av1_highbd_warp_affine_avx2:
   23|   264k|                                 int16_t beta, int16_t gamma, int16_t delta) {
   24|   264k|  __m256i tmp[15];
   25|   264k|  const int reduce_bits_horiz = conv_params->round_0;
   26|   264k|  const int reduce_bits_vert = conv_params->is_compound
  ------------------
  |  Branch (26:32): [True: 22.8k, False: 241k]
  ------------------
   27|   264k|                                   ? conv_params->round_1
   28|   264k|                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  ------------------
  |  |   21|   241k|#define FILTER_BITS 7
  ------------------
   29|   264k|  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  ------------------
  |  |   21|   264k|#define FILTER_BITS 7
  ------------------
   30|   264k|  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  ------------------
  |  |   21|   264k|#define FILTER_BITS 7
  ------------------
   31|   264k|  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  ------------------
  |  |   21|   264k|#define FILTER_BITS 7
  ------------------
   32|   264k|  const int round_bits =
   33|   264k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   264k|#define FILTER_BITS 7
  ------------------
   34|   264k|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|   264k|#define FILTER_BITS 7
  ------------------
   35|   264k|  (void)max_bits_horiz;
   36|   264k|  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
   37|       |
   38|       |  // Check that, even with 12-bit input, the intermediate values will fit
   39|       |  // into an unsigned 16-bit intermediate array.
   40|   264k|  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
   41|       |
   42|   264k|  const __m256i clip_pixel =
   43|   264k|      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  ------------------
  |  Branch (43:25): [True: 260k, False: 3.84k]
  |  Branch (43:44): [True: 3.84k, False: 0]
  ------------------
   44|   264k|  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
   45|   264k|  const __m256i reduce_bits_vert_const =
   46|   264k|      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
   47|   264k|  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
   48|   264k|  const __m256i res_sub_const =
   49|   264k|      _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
   50|   264k|                        (1 << (offset_bits - conv_params->round_1 - 1)));
   51|   264k|  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
   52|   264k|  __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));
   53|       |
   54|   264k|  const int w0 = conv_params->fwd_offset;
   55|   264k|  const int w1 = conv_params->bck_offset;
   56|   264k|  const __m256i wt0 = _mm256_set1_epi32(w0);
   57|   264k|  const __m256i wt1 = _mm256_set1_epi32(w1);
   58|       |
   59|   264k|  __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
   60|   264k|  __m256i v_zeros = _mm256_setzero_si256();
   61|   264k|  int ohoriz = 1 << offset_bits_horiz;
   62|   264k|  int mhoriz = 1 << max_bits_horiz;
   63|   264k|  (void)mhoriz;
   64|   264k|  int sx;
   65|       |
   66|   821k|  for (int i = 0; i < p_height; i += 8) {
  ------------------
  |  Branch (66:19): [True: 557k, False: 264k]
  ------------------
   67|  2.60M|    for (int j = 0; j < p_width; j += 8) {
  ------------------
  |  Branch (67:21): [True: 2.04M, False: 557k]
  ------------------
   68|       |      // Calculate the center of this 8x8 block,
   69|       |      // project to luma coordinates (if in a subsampled chroma plane),
   70|       |      // apply the affine transformation,
   71|       |      // then convert back to the original coordinates (if necessary)
   72|  2.04M|      const int32_t src_x = (p_col + j + 4) << subsampling_x;
   73|  2.04M|      const int32_t src_y = (p_row + i + 4) << subsampling_y;
   74|  2.04M|      const int64_t dst_x =
   75|  2.04M|          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
   76|  2.04M|      const int64_t dst_y =
   77|  2.04M|          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
   78|  2.04M|      const int64_t x4 = dst_x >> subsampling_x;
   79|  2.04M|      const int64_t y4 = dst_y >> subsampling_y;
   80|       |
   81|  2.04M|      const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  2.04M|#define WARPEDMODEL_PREC_BITS 16
  ------------------
   82|  2.04M|      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  ------------------
  |  |   96|  2.04M|#define WARPEDMODEL_PREC_BITS 16
  ------------------
   83|  2.04M|      const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  2.04M|#define WARPEDMODEL_PREC_BITS 16
  ------------------
   84|  2.04M|      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  ------------------
  |  |   96|  2.04M|#define WARPEDMODEL_PREC_BITS 16
  ------------------
   85|       |
   86|  2.04M|      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
  ------------------
  |  |  107|  2.04M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  2.04M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  2.04M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
   87|  2.04M|             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  103|  2.04M|#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  2.04M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
                           (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  107|  2.04M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  2.04M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  2.04M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
   88|  2.04M|      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
  ------------------
  |  |  107|  2.04M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  2.04M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  2.04M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
   89|  2.04M|             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  103|  2.04M|#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  2.04M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
                           (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  107|  2.04M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  2.04M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  2.04M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
   90|       |
   91|  2.04M|      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
  ------------------
  |  |  105|  2.04M|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
   92|  2.04M|      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
  ------------------
  |  |  105|  2.04M|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
   93|       |
   94|       |      // Horizontal filter
   95|  2.04M|      if (ix4 <= -7) {
  ------------------
  |  Branch (95:11): [True: 96.0k, False: 1.94M]
  ------------------
   96|  1.53M|        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  1.53M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.37M, False: 158k]
  |  |  ------------------
  ------------------
  |  Branch (96:26): [True: 1.44M, False: 96.0k]
  ------------------
   97|  1.44M|          int iy = iy4 + k;
   98|  1.44M|          if (iy < 0)
  ------------------
  |  Branch (98:15): [True: 3.15k, False: 1.43M]
  ------------------
   99|  3.15k|            iy = 0;
  100|  1.43M|          else if (iy > height - 1)
  ------------------
  |  Branch (100:20): [True: 24.2k, False: 1.41M]
  ------------------
  101|  24.2k|            iy = height - 1;
  102|  1.44M|          tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
  103|  1.44M|              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
  ------------------
  |  |   21|  1.44M|#define FILTER_BITS 7
  ------------------
  104|  1.44M|              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))));
  ------------------
  |  |   21|  1.44M|#define FILTER_BITS 7
  ------------------
  105|  1.44M|        }
  106|  1.94M|      } else if (ix4 >= width + 6) {
  ------------------
  |  Branch (106:18): [True: 179k, False: 1.76M]
  ------------------
  107|  2.86M|        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  2.86M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.59M, False: 270k]
  |  |  ------------------
  ------------------
  |  Branch (107:26): [True: 2.68M, False: 179k]
  ------------------
  108|  2.68M|          int iy = iy4 + k;
  109|  2.68M|          if (iy < 0)
  ------------------
  |  Branch (109:15): [True: 49.4k, False: 2.63M]
  ------------------
  110|  49.4k|            iy = 0;
  111|  2.63M|          else if (iy > height - 1)
  ------------------
  |  Branch (111:20): [True: 25.9k, False: 2.61M]
  ------------------
  112|  25.9k|            iy = height - 1;
  113|  2.68M|          tmp[k + 7] = _mm256_cvtepi16_epi32(
  114|  2.68M|              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
  ------------------
  |  |   21|  2.68M|#define FILTER_BITS 7
  ------------------
  115|  2.68M|                             ref[iy * stride + (width - 1)] *
  116|  2.68M|                                 (1 << (FILTER_BITS - reduce_bits_horiz))));
  ------------------
  |  |   21|  2.68M|#define FILTER_BITS 7
  ------------------
  117|  2.68M|        }
  118|  1.76M|      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
  ------------------
  |  Branch (118:18): [True: 45.3k, False: 1.72M]
  |  Branch (118:37): [True: 45.8k, False: 1.67M]
  ------------------
  119|  92.2k|        int32_t tmp1[8];
  120|  1.47M|        for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  1.47M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.27M, False: 202k]
  |  |  ------------------
  ------------------
  |  Branch (120:26): [True: 1.38M, False: 92.2k]
  ------------------
  121|  1.38M|          const int iy = clamp(iy4 + k, 0, height - 1);
  122|       |
  123|  1.38M|          sx = sx4 + beta * (k + 4);
  124|  12.4M|          for (int l = -4; l < 4; ++l) {
  ------------------
  |  Branch (124:28): [True: 11.0M, False: 1.38M]
  ------------------
  125|  11.0M|            int ix = ix4 + l - 3;
  126|  11.0M|            const int offs = sx >> WARPEDDIFF_PREC_BITS;
  ------------------
  |  |  107|  11.0M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  11.0M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  11.0M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  127|  11.0M|            const int16_t *coeffs = av1_warped_filter[offs];
  128|       |
  129|  11.0M|            int32_t sum = 1 << offset_bits_horiz;
  130|  99.5M|            for (int m = 0; m < 8; ++m) {
  ------------------
  |  Branch (130:29): [True: 88.4M, False: 11.0M]
  ------------------
  131|  88.4M|              const int sample_x = clamp(ix + m, 0, width - 1);
  132|  88.4M|              sum += ref[iy * stride + sample_x] * coeffs[m];
  133|  88.4M|            }
  134|  11.0M|            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
  ------------------
  |  |   41|  11.0M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
  135|  11.0M|            tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
  136|  11.0M|            sx += alpha;
  137|  11.0M|          }
  138|  1.38M|          tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
  139|  1.38M|        }
  140|  1.67M|      } else {
  141|  1.67M|        if (beta == 0 && alpha == 0) {
  ------------------
  |  Branch (141:13): [True: 807k, False: 869k]
  |  Branch (141:26): [True: 430k, False: 377k]
  ------------------
  142|   430k|          sx = sx4;
  143|   430k|          __m128i v_01 = _mm_loadu_si128(
  144|   430k|              (__m128i *)
  145|   430k|                  av1_warped_filter[sx >>
  146|   430k|                                    WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
  ------------------
  |  |  107|   430k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   430k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   430k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  147|   430k|          __m256i v_c01 = _mm256_broadcastd_epi32(v_01);     // A1A0A1A0A1A0A1A0
  148|   430k|          __m256i v_c23 = _mm256_broadcastd_epi32(
  149|   430k|              _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
  150|   430k|          __m256i v_c45 = _mm256_broadcastd_epi32(
  151|   430k|              _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
  152|   430k|          __m256i v_c67 = _mm256_broadcastd_epi32(
  153|   430k|              _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
  154|  6.61M|          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  6.61M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 4.95M, False: 1.66M]
  |  |  ------------------
  ------------------
  |  Branch (154:28): [True: 6.18M, False: 430k]
  ------------------
  155|  6.18M|            int iy = iy4 + k;
  156|  6.18M|            if (iy < 0)
  ------------------
  |  Branch (156:17): [True: 146k, False: 6.04M]
  ------------------
  157|   146k|              iy = 0;
  158|  6.04M|            else if (iy > height - 1)
  ------------------
  |  Branch (158:22): [True: 247k, False: 5.79M]
  ------------------
  159|   247k|              iy = height - 1;
  160|  6.18M|            iy = iy * stride;
  161|       |
  162|  6.18M|            __m256i v_refl = _mm256_inserti128_si256(
  163|  6.18M|                _mm256_setzero_si256(),
  164|  6.18M|                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
  165|  6.18M|            v_refl = _mm256_inserti128_si256(
  166|  6.18M|                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
  167|  6.18M|                1);  // R15 .. R0
  168|       |
  169|  6.18M|            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
  170|       |
  171|  6.18M|            __m256i v_refu =
  172|  6.18M|                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
  173|  6.18M|            v_refl = _mm256_inserti128_si256(
  174|  6.18M|                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
  175|  6.18M|            v_refu = _mm256_inserti128_si256(
  176|  6.18M|                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
  177|       |
  178|  6.18M|            __m256i v_sum = _mm256_set1_epi32(ohoriz);
  179|  6.18M|            __m256i parsum = _mm256_madd_epi16(
  180|  6.18M|                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
  181|  6.18M|                                          0));  // R8R7R6..R1R7R6R5..R1R0
  182|  6.18M|            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
  183|       |
  184|  6.18M|            parsum = _mm256_madd_epi16(
  185|  6.18M|                v_c23,
  186|  6.18M|                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
  187|  6.18M|            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
  188|  6.18M|            parsum = _mm256_madd_epi16(
  189|  6.18M|                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
  190|  6.18M|                                          8));  // R12R11..R5R11R10..R5R4
  191|  6.18M|            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
  192|  6.18M|            parsum = _mm256_madd_epi16(
  193|  6.18M|                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
  194|  6.18M|                                          12));  // R14R13..R7R13R12..R7R6
  195|  6.18M|            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
  196|       |
  197|  6.18M|            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
  198|  6.18M|                                           reduce_bits_horiz);
  199|  6.18M|          }
  200|  1.24M|        } else if (alpha == 0) {
  ------------------
  |  Branch (200:20): [True: 373k, False: 873k]
  ------------------
  201|  5.95M|          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  5.95M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 4.24M, False: 1.70M]
  |  |  ------------------
  ------------------
  |  Branch (201:28): [True: 5.57M, False: 373k]
  ------------------
  202|  5.57M|            int iy = iy4 + k;
  203|  5.57M|            if (iy < 0)
  ------------------
  |  Branch (203:17): [True: 222k, False: 5.35M]
  ------------------
  204|   222k|              iy = 0;
  205|  5.35M|            else if (iy > height - 1)
  ------------------
  |  Branch (205:22): [True: 177k, False: 5.17M]
  ------------------
  206|   177k|              iy = height - 1;
  207|  5.57M|            iy = iy * stride;
  208|       |
  209|  5.57M|            sx = sx4 + beta * (k + 4);
  210|       |
  211|  5.57M|            __m128i v_01 = _mm_loadu_si128(
  212|  5.57M|                (__m128i *)av1_warped_filter
  213|  5.57M|                    [sx >> WARPEDDIFF_PREC_BITS]);          // A7A6A5A4A3A2A1A0
  ------------------
  |  |  107|  5.57M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  5.57M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  5.57M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  214|  5.57M|            __m256i v_c01 = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
  215|  5.57M|            __m256i v_c23 = _mm256_broadcastd_epi32(
  216|  5.57M|                _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
  217|  5.57M|            __m256i v_c45 = _mm256_broadcastd_epi32(
  218|  5.57M|                _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
  219|  5.57M|            __m256i v_c67 = _mm256_broadcastd_epi32(
  220|  5.57M|                _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
  221|       |
  222|  5.57M|            __m256i v_refl = _mm256_inserti128_si256(
  223|  5.57M|                _mm256_setzero_si256(),
  224|  5.57M|                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
  225|  5.57M|            v_refl = _mm256_inserti128_si256(
  226|  5.57M|                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
  227|  5.57M|                1);  // R15 .. R0
  228|       |
  229|  5.57M|            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
  230|       |
  231|  5.57M|            __m256i v_refu =
  232|  5.57M|                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
  233|       |
  234|  5.57M|            v_refl = _mm256_inserti128_si256(
  235|  5.57M|                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
  236|  5.57M|            v_refu = _mm256_inserti128_si256(
  237|  5.57M|                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
  238|       |
  239|  5.57M|            __m256i v_sum = _mm256_set1_epi32(ohoriz);
  240|  5.57M|            __m256i parsum =
  241|  5.57M|                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
  242|  5.57M|            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
  243|       |
  244|  5.57M|            parsum =
  245|  5.57M|                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
  246|  5.57M|            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
  247|  5.57M|            parsum =
  248|  5.57M|                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
  249|  5.57M|            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
  250|  5.57M|            parsum = _mm256_madd_epi16(v_c67,
  251|  5.57M|                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
  252|  5.57M|            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
  253|       |
  254|  5.57M|            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
  255|  5.57M|                                           reduce_bits_horiz);
  256|  5.57M|          }
  257|   873k|        } else if (beta == 0) {
  ------------------
  |  Branch (257:20): [True: 377k, False: 496k]
  ------------------
  258|   377k|          sx = sx4;
  259|   377k|          __m256i v_coeff01 = _mm256_inserti128_si256(
  260|   377k|              v_zeros,
  261|   377k|              _mm_loadu_si128(
  262|   377k|                  (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
  263|   377k|              0);
  264|   377k|          v_coeff01 = _mm256_inserti128_si256(
  265|   377k|              v_coeff01,
  266|   377k|              _mm_loadu_si128(
  267|   377k|                  (__m128i *)
  268|   377k|                      av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
  269|   377k|              1);  // B7B6..B1B0A7A6..A1A0
  270|   377k|          __m256i v_coeff23 = _mm256_inserti128_si256(
  271|   377k|              v_zeros,
  272|   377k|              _mm_loadu_si128(
  273|   377k|                  (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
  274|   377k|                                               WARPEDDIFF_PREC_BITS]),
  275|   377k|              0);
  276|   377k|          v_coeff23 = _mm256_inserti128_si256(
  277|   377k|              v_coeff23,
  278|   377k|              _mm_loadu_si128(
  279|   377k|                  (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
  280|   377k|                                               WARPEDDIFF_PREC_BITS]),
  281|   377k|              1);  // D7D6..D1D0C7C6..C1C0
  282|   377k|          __m256i v_coeff45 = _mm256_inserti128_si256(
  283|   377k|              v_zeros,
  284|   377k|              _mm_loadu_si128(
  285|   377k|                  (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
  286|   377k|                                               WARPEDDIFF_PREC_BITS]),
  287|   377k|              0);
  288|   377k|          v_coeff45 = _mm256_inserti128_si256(
  289|   377k|              v_coeff45,
  290|   377k|              _mm_loadu_si128(
  291|   377k|                  (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
  292|   377k|                                               WARPEDDIFF_PREC_BITS]),
  293|   377k|              1);  // F7F6..F1F0E7E6..E1E0
  294|   377k|          __m256i v_coeff67 = _mm256_inserti128_si256(
  295|   377k|              v_zeros,
  296|   377k|              _mm_loadu_si128(
  297|   377k|                  (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
  298|   377k|                                               WARPEDDIFF_PREC_BITS]),
  299|   377k|              0);
  300|   377k|          v_coeff67 = _mm256_inserti128_si256(
  301|   377k|              v_coeff67,
  302|   377k|              _mm_loadu_si128(
  303|   377k|                  (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
  304|   377k|                                               WARPEDDIFF_PREC_BITS]),
  305|   377k|              1);  // H7H6..H1H0G7G6..G1G0
  306|       |
  307|   377k|          __m256i v_c0123 = _mm256_unpacklo_epi32(
  308|   377k|              v_coeff01,
  309|   377k|              v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
  310|   377k|          __m256i v_c0123u = _mm256_unpackhi_epi32(
  311|   377k|              v_coeff01,
  312|   377k|              v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
  313|   377k|          __m256i v_c4567 = _mm256_unpacklo_epi32(
  314|   377k|              v_coeff45,
  315|   377k|              v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
  316|   377k|          __m256i v_c4567u = _mm256_unpackhi_epi32(
  317|   377k|              v_coeff45,
  318|   377k|              v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
  319|       |
  320|   377k|          __m256i v_c01 = _mm256_unpacklo_epi64(
  321|   377k|              v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
  322|   377k|          __m256i v_c23 =
  323|   377k|              _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
  324|   377k|          __m256i v_c45 =
  325|   377k|              _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
  326|   377k|          __m256i v_c67 =
  327|   377k|              _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
  328|       |
  329|  6.02M|          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  6.02M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.85M, False: 2.17M]
  |  |  ------------------
  ------------------
  |  Branch (329:28): [True: 5.64M, False: 377k]
  ------------------
  330|  5.64M|            int iy = iy4 + k;
  331|  5.64M|            if (iy < 0)
  ------------------
  |  Branch (331:17): [True: 85.6k, False: 5.55M]
  ------------------
  332|  85.6k|              iy = 0;
  333|  5.55M|            else if (iy > height - 1)
  ------------------
  |  Branch (333:22): [True: 55.4k, False: 5.50M]
  ------------------
  334|  55.4k|              iy = height - 1;
  335|  5.64M|            iy = iy * stride;
  336|       |
  337|  5.64M|            __m256i v_refl = _mm256_inserti128_si256(
  338|  5.64M|                _mm256_setzero_si256(),
  339|  5.64M|                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
  340|  5.64M|            v_refl = _mm256_inserti128_si256(
  341|  5.64M|                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
  342|  5.64M|                1);  // R15 .. R0
  343|       |
  344|  5.64M|            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
  345|       |
  346|  5.64M|            __m256i v_refu =
  347|  5.64M|                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
  348|       |
  349|  5.64M|            v_refl = _mm256_inserti128_si256(
  350|  5.64M|                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
  351|  5.64M|            v_refu = _mm256_inserti128_si256(
  352|  5.64M|                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
  353|       |
  354|  5.64M|            __m256i v_sum = _mm256_set1_epi32(ohoriz);
  355|  5.64M|            __m256i parsum = _mm256_madd_epi16(
  356|  5.64M|                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
  357|  5.64M|                                          0));  // R8R7R6..R1R7R6R5..R1R0
  358|  5.64M|            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
  359|       |
  360|  5.64M|            parsum = _mm256_madd_epi16(
  361|  5.64M|                v_c23,
  362|  5.64M|                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
  363|  5.64M|            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
  364|  5.64M|            parsum = _mm256_madd_epi16(
  365|  5.64M|                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
  366|  5.64M|                                          8));  // R12R11..R5R11R10..R5R4
  367|  5.64M|            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
  368|  5.64M|            parsum = _mm256_madd_epi16(
  369|  5.64M|                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
  370|  5.64M|                                          12));  // R14R13..R7R13R12..R7R6
  371|  5.64M|            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
  372|       |
  373|  5.64M|            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
  374|  5.64M|                                           reduce_bits_horiz);
  375|  5.64M|          }
  376|       |
  377|   496k|        } else {
  378|  7.92M|          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
  ------------------
  |  |   34|  7.92M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.47M, False: 2.44M]
  |  |  ------------------
  ------------------
  |  Branch (378:28): [True: 7.42M, False: 496k]
  ------------------
  379|  7.42M|            int iy = iy4 + k;
  380|  7.42M|            if (iy < 0)
  ------------------
  |  Branch (380:17): [True: 306k, False: 7.11M]
  ------------------
  381|   306k|              iy = 0;
  382|  7.11M|            else if (iy > height - 1)
  ------------------
  |  Branch (382:22): [True: 319k, False: 6.79M]
  ------------------
  383|   319k|              iy = height - 1;
  384|  7.42M|            iy = iy * stride;
  385|       |
  386|  7.42M|            sx = sx4 + beta * (k + 4);
  387|       |
  388|  7.42M|            __m256i v_coeff01 = _mm256_inserti128_si256(
  389|  7.42M|                v_zeros,
  390|  7.42M|                _mm_loadu_si128(
  391|  7.42M|                    (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
  392|  7.42M|                0);
  393|  7.42M|            v_coeff01 = _mm256_inserti128_si256(
  394|  7.42M|                v_coeff01,
  395|  7.42M|                _mm_loadu_si128(
  396|  7.42M|                    (__m128i *)av1_warped_filter[(sx + alpha) >>
  397|  7.42M|                                                 WARPEDDIFF_PREC_BITS]),
  398|  7.42M|                1);  // B7B6..B1B0A7A6..A1A0
  399|  7.42M|            __m256i v_coeff23 = _mm256_inserti128_si256(
  400|  7.42M|                v_zeros,
  401|  7.42M|                _mm_loadu_si128(
  402|  7.42M|                    (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
  403|  7.42M|                                                 WARPEDDIFF_PREC_BITS]),
  404|  7.42M|                0);
  405|  7.42M|            v_coeff23 = _mm256_inserti128_si256(
  406|  7.42M|                v_coeff23,
  407|  7.42M|                _mm_loadu_si128(
  408|  7.42M|                    (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
  409|  7.42M|                                                 WARPEDDIFF_PREC_BITS]),
  410|  7.42M|                1);  // D7D6..D1D0C7C6..C1C0
  411|  7.42M|            __m256i v_coeff45 = _mm256_inserti128_si256(
  412|  7.42M|                v_zeros,
  413|  7.42M|                _mm_loadu_si128(
  414|  7.42M|                    (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
  415|  7.42M|                                                 WARPEDDIFF_PREC_BITS]),
  416|  7.42M|                0);
  417|  7.42M|            v_coeff45 = _mm256_inserti128_si256(
  418|  7.42M|                v_coeff45,
  419|  7.42M|                _mm_loadu_si128(
  420|  7.42M|                    (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
  421|  7.42M|                                                 WARPEDDIFF_PREC_BITS]),
  422|  7.42M|                1);  // F7F6..F1F0E7E6..E1E0
  423|  7.42M|            __m256i v_coeff67 = _mm256_inserti128_si256(
  424|  7.42M|                v_zeros,
  425|  7.42M|                _mm_loadu_si128(
  426|  7.42M|                    (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
  427|  7.42M|                                                 WARPEDDIFF_PREC_BITS]),
  428|  7.42M|                0);
  429|  7.42M|            v_coeff67 = _mm256_inserti128_si256(
  430|  7.42M|                v_coeff67,
  431|  7.42M|                _mm_loadu_si128(
  432|  7.42M|                    (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
  433|  7.42M|                                                 WARPEDDIFF_PREC_BITS]),
  434|  7.42M|                1);  // H7H6..H1H0G7G6..G1G0
  435|       |
  436|  7.42M|            __m256i v_c0123 = _mm256_unpacklo_epi32(
  437|  7.42M|                v_coeff01,
  438|  7.42M|                v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
  439|  7.42M|            __m256i v_c0123u = _mm256_unpackhi_epi32(
  440|  7.42M|                v_coeff01,
  441|  7.42M|                v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
  442|  7.42M|            __m256i v_c4567 = _mm256_unpacklo_epi32(
  443|  7.42M|                v_coeff45,
  444|  7.42M|                v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
  445|  7.42M|            __m256i v_c4567u = _mm256_unpackhi_epi32(
  446|  7.42M|                v_coeff45,
  447|  7.42M|                v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
  448|       |
  449|  7.42M|            __m256i v_c01 = _mm256_unpacklo_epi64(
  450|  7.42M|                v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
  451|  7.42M|            __m256i v_c23 =
  452|  7.42M|                _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
  453|  7.42M|            __m256i v_c45 =
  454|  7.42M|                _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
  455|  7.42M|            __m256i v_c67 =
  456|  7.42M|                _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
  457|       |
  458|  7.42M|            __m256i v_refl = _mm256_inserti128_si256(
  459|  7.42M|                _mm256_setzero_si256(),
  460|  7.42M|                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
  461|  7.42M|            v_refl = _mm256_inserti128_si256(
  462|  7.42M|                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
  463|  7.42M|                1);  // R15 .. R0
  464|       |
  465|  7.42M|            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
  466|       |
  467|  7.42M|            __m256i v_refu =
  468|  7.42M|                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
  469|       |
  470|  7.42M|            v_refl = _mm256_inserti128_si256(
  471|  7.42M|                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
  472|  7.42M|            v_refu = _mm256_inserti128_si256(
  473|  7.42M|                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
  474|       |
  475|  7.42M|            __m256i v_sum = _mm256_set1_epi32(ohoriz);
  476|  7.42M|            __m256i parsum =
  477|  7.42M|                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
  478|  7.42M|            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
  479|       |
  480|  7.42M|            parsum =
  481|  7.42M|                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
  482|  7.42M|            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
  483|  7.42M|            parsum =
  484|  7.42M|                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
  485|  7.42M|            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
  486|  7.42M|            parsum = _mm256_madd_epi16(v_c67,
  487|  7.42M|                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
  488|  7.42M|            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
  489|       |
  490|  7.42M|            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
  491|  7.42M|                                           reduce_bits_horiz);
  492|  7.42M|          }
  493|   496k|        }
  494|  1.67M|      }
  495|       |
  496|       |      // Vertical filter
  497|  18.2M|      for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
  ------------------
  |  |   34|  18.2M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 13.3M, False: 4.87M]
  |  |  ------------------
  ------------------
  |  Branch (497:24): [True: 16.1M, False: 2.04M]
  ------------------
  498|  16.1M|        int sy = sy4 + delta * (k + 4);
  499|  16.1M|        const __m256i *src = tmp + (k + 4);
  500|       |
  501|  16.1M|        __m256i v_coeff01 = _mm256_inserti128_si256(
  502|  16.1M|            v_zeros,
  503|  16.1M|            _mm_loadu_si128(
  504|  16.1M|                (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]),
  505|  16.1M|            0);
  506|  16.1M|        v_coeff01 = _mm256_inserti128_si256(
  507|  16.1M|            v_coeff01,
  508|  16.1M|            _mm_loadu_si128(
  509|  16.1M|                (__m128i *)
  510|  16.1M|                    av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
  511|  16.1M|            1);
  512|  16.1M|        __m256i v_coeff23 = _mm256_inserti128_si256(
  513|  16.1M|            v_zeros,
  514|  16.1M|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >>
  515|  16.1M|                                                         WARPEDDIFF_PREC_BITS]),
  516|  16.1M|            0);
  517|  16.1M|        v_coeff23 = _mm256_inserti128_si256(
  518|  16.1M|            v_coeff23,
  519|  16.1M|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >>
  520|  16.1M|                                                         WARPEDDIFF_PREC_BITS]),
  521|  16.1M|            1);
  522|  16.1M|        __m256i v_coeff45 = _mm256_inserti128_si256(
  523|  16.1M|            v_zeros,
  524|  16.1M|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >>
  525|  16.1M|                                                         WARPEDDIFF_PREC_BITS]),
  526|  16.1M|            0);
  527|  16.1M|        v_coeff45 = _mm256_inserti128_si256(
  528|  16.1M|            v_coeff45,
  529|  16.1M|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >>
  530|  16.1M|                                                         WARPEDDIFF_PREC_BITS]),
  531|  16.1M|            1);
  532|  16.1M|        __m256i v_coeff67 = _mm256_inserti128_si256(
  533|  16.1M|            v_zeros,
  534|  16.1M|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >>
  535|  16.1M|                                                         WARPEDDIFF_PREC_BITS]),
  536|  16.1M|            0);
  537|  16.1M|        v_coeff67 = _mm256_inserti128_si256(
  538|  16.1M|            v_coeff67,
  539|  16.1M|            _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >>
  540|  16.1M|                                                         WARPEDDIFF_PREC_BITS]),
  541|  16.1M|            1);
  542|       |
  543|  16.1M|        __m256i v_c0123 = _mm256_unpacklo_epi32(
  544|  16.1M|            v_coeff01,
  545|  16.1M|            v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
  546|  16.1M|        __m256i v_c0123u = _mm256_unpackhi_epi32(
  547|  16.1M|            v_coeff01,
  548|  16.1M|            v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
  549|  16.1M|        __m256i v_c4567 = _mm256_unpacklo_epi32(
  550|  16.1M|            v_coeff45,
  551|  16.1M|            v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
  552|  16.1M|        __m256i v_c4567u = _mm256_unpackhi_epi32(
  553|  16.1M|            v_coeff45,
  554|  16.1M|            v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
  555|       |
  556|  16.1M|        __m256i v_c01 = _mm256_unpacklo_epi64(
  557|  16.1M|            v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
  558|  16.1M|        __m256i v_c23 =
  559|  16.1M|            _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
  560|  16.1M|        __m256i v_c45 =
  561|  16.1M|            _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
  562|  16.1M|        __m256i v_c67 =
  563|  16.1M|            _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
  564|       |
  565|  16.1M|        __m256i v_src01l =
  566|  16.1M|            _mm256_unpacklo_epi32(src[0], src[1]);  // T13T03T11T01T12T02T10T00
  567|  16.1M|        __m256i v_src01u =
  568|  16.1M|            _mm256_unpackhi_epi32(src[0], src[1]);  // T17T07T15T05T16T06T14T04
  569|  16.1M|        __m256i v_sum =
  570|  16.1M|            _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
  571|  16.1M|                              v_c01);  // S7S5S3S1S6S4S2S0
  572|       |
  573|  16.1M|        __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
  574|  16.1M|        __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
  575|  16.1M|        v_sum = _mm256_add_epi32(
  576|  16.1M|            v_sum,
  577|  16.1M|            _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23));
  578|       |
  579|  16.1M|        __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
  580|  16.1M|        __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
  581|  16.1M|        v_sum = _mm256_add_epi32(
  582|  16.1M|            v_sum,
  583|  16.1M|            _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45));
  584|       |
  585|  16.1M|        __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
  586|  16.1M|        __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
  587|  16.1M|        v_sum = _mm256_add_epi32(
  588|  16.1M|            v_sum,
  589|  16.1M|            _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67));
  590|       |
  591|       |        // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0
  592|       |
  593|  16.1M|        __m256i v_suml =
  594|  16.1M|            _mm256_permute4x64_epi64(v_sum, 0xD8);  // S7S5S6S4S3S1S2S0
  595|  16.1M|        __m256i v_sumh =
  596|  16.1M|            _mm256_permute4x64_epi64(v_sum, 0x32);      // S2S0S7S5S2S0S3S1
  597|  16.1M|        v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh);  // S7S6S5S4S3S2S1S0
  598|       |
  599|  16.1M|        if (conv_params->is_compound) {
  ------------------
  |  Branch (599:13): [True: 991k, False: 15.1M]
  ------------------
  600|   991k|          __m128i *const p =
  601|   991k|              (__m128i *)&conv_params
  602|   991k|                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
  603|       |
  604|   991k|          v_sum = _mm256_add_epi32(v_sum, res_add_const);
  605|   991k|          v_sum =
  606|   991k|              _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
  607|   991k|                               reduce_bits_vert_shift);
  608|   991k|          if (conv_params->do_average) {
  ------------------
  |  Branch (608:15): [True: 135k, False: 856k]
  ------------------
  609|   135k|            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
  610|   135k|            __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));
  611|       |
  612|   135k|            if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (612:17): [True: 36.1k, False: 99.1k]
  ------------------
  613|  36.1k|              v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
  614|  36.1k|                                       _mm256_mullo_epi32(v_sum, wt1));
  615|  36.1k|              v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  36.1k|#define DIST_PRECISION_BITS 4
  ------------------
  616|  99.1k|            } else {
  617|  99.1k|              v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
  618|  99.1k|            }
  619|       |
  620|   135k|            __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
  621|   135k|            v_sum1 = _mm256_sra_epi32(
  622|   135k|                _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);
  623|       |
  624|   135k|            __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
  625|   135k|            v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
  626|   135k|            v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
  627|   135k|            _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
  628|   856k|          } else {
  629|   856k|            v_sum = _mm256_packus_epi32(v_sum, v_sum);
  630|   856k|            __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
  631|   856k|            _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
  632|   856k|          }
  633|  15.1M|        } else {
  634|       |          // Round and pack into 8 bits
  635|  15.1M|          const __m256i round_const =
  636|  15.1M|              _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
  637|  15.1M|                                ((1 << reduce_bits_vert) >> 1));
  638|       |
  639|  15.1M|          __m256i v_sum1 = _mm256_srai_epi32(
  640|  15.1M|              _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);
  641|       |
  642|  15.1M|          v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
  643|  15.1M|          __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
  644|       |          // Clamp res_16bit to the range [0, 2^bd - 1]
  645|  15.1M|          const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
  646|  15.1M|          const __m256i zero = _mm256_setzero_si256();
  647|  15.1M|          v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);
  648|       |
  649|  15.1M|          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
  650|       |
  651|  15.1M|          _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
  652|  15.1M|        }
  653|  16.1M|      }
  654|  2.04M|    }
  655|   557k|  }
  656|   264k|}

av1_highbd_wiener_convolve_add_src_avx2:
   32|   366k|    const WienerConvolveParams *conv_params, int bd) {
   33|   366k|  assert(x_step_q4 == 16 && y_step_q4 == 16);
   34|   366k|  assert(!(w & 7));
   35|   366k|  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
   36|   366k|  (void)x_step_q4;
   37|   366k|  (void)y_step_q4;
   38|       |
   39|   366k|  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  ------------------
  |  |   75|   366k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   40|   366k|  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|   366k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
   41|       |
   42|   366k|  DECLARE_ALIGNED(32, uint16_t,
  ------------------
  |  |   19|   366k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   43|   366k|                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
   44|   366k|  int intermediate_height = h + SUBPEL_TAPS - 1;
  ------------------
  |  |   26|   366k|#define SUBPEL_TAPS 8
  ------------------
   45|   366k|  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  ------------------
  |  |   26|   366k|#define SUBPEL_TAPS 8
  ------------------
   46|   366k|  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
   47|       |
   48|   366k|  const __m128i zero_128 = _mm_setzero_si128();
   49|   366k|  const __m256i zero_256 = _mm256_setzero_si256();
   50|       |
   51|       |  // Add an offset to account for the "add_src" part of the convolve function.
   52|   366k|  const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
   53|       |
   54|   366k|  const __m256i clamp_low = zero_256;
   55|       |
   56|       |  /* Horizontal filter */
   57|   366k|  {
   58|   366k|    const __m256i clamp_high_ep =
   59|   366k|        _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
  ------------------
  |  |   43|   366k|#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0))
  |  |  ------------------
  |  |  |  |   21|   366k|#define FILTER_BITS 7
  |  |  ------------------
  ------------------
   60|       |
   61|       |    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
   62|   366k|    const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
   63|       |
   64|       |    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
   65|   366k|    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
   66|       |    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
   67|   366k|    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
   68|       |
   69|       |    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
   70|   366k|    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
   71|       |    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
   72|   366k|    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
   73|       |    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
   74|   366k|    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
   75|       |    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
   76|   366k|    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
   77|       |
   78|       |    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
   79|   366k|    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
   80|       |    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
   81|   366k|    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
   82|       |    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
   83|   366k|    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
   84|       |    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
   85|   366k|    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
   86|       |
   87|   366k|    const __m256i round_const = _mm256_set1_epi32(
   88|   366k|        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
  ------------------
  |  |   21|   366k|#define FILTER_BITS 7
  ------------------
   89|       |
   90|  11.4M|    for (int i = 0; i < intermediate_height; ++i) {
  ------------------
  |  Branch (90:21): [True: 11.0M, False: 366k]
  ------------------
   91|  37.7M|      for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (91:23): [True: 26.6M, False: 11.0M]
  ------------------
   92|  26.6M|        const uint16_t *src_ij = src_ptr + i * src_stride + j;
   93|       |
   94|       |        // Load 16-bit src data
   95|  26.6M|        const __m256i src_0 = yy_loadu_256(src_ij + 0);
   96|  26.6M|        const __m256i src_1 = yy_loadu_256(src_ij + 1);
   97|  26.6M|        const __m256i src_2 = yy_loadu_256(src_ij + 2);
   98|  26.6M|        const __m256i src_3 = yy_loadu_256(src_ij + 3);
   99|  26.6M|        const __m256i src_4 = yy_loadu_256(src_ij + 4);
  100|  26.6M|        const __m256i src_5 = yy_loadu_256(src_ij + 5);
  101|  26.6M|        const __m256i src_6 = yy_loadu_256(src_ij + 6);
  102|  26.6M|        const __m256i src_7 = yy_loadu_256(src_ij + 7);
  103|       |
  104|       |        // Multiply src data by filter coeffs and sum pairs
  105|  26.6M|        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
  106|  26.6M|        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
  107|  26.6M|        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
  108|  26.6M|        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
  109|  26.6M|        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
  110|  26.6M|        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
  111|  26.6M|        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
  112|  26.6M|        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
  113|       |
  114|       |        // Calculate scalar product for even- and odd-indices separately,
  115|       |        // increasing to 32-bit precision
  116|  26.6M|        const __m256i res_even_sum = _mm256_add_epi32(
  117|  26.6M|            _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
  118|  26.6M|        const __m256i res_even = _mm256_srai_epi32(
  119|  26.6M|            _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
  120|       |
  121|  26.6M|        const __m256i res_odd_sum = _mm256_add_epi32(
  122|  26.6M|            _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
  123|  26.6M|        const __m256i res_odd = _mm256_srai_epi32(
  124|  26.6M|            _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
  125|       |
  126|       |        // Reduce to 16-bit precision and pack even- and odd-index results
  127|       |        // back into one register. The _mm256_packs_epi32 intrinsic returns
  128|       |        // a register with the pixels ordered as follows:
  129|       |        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
  130|  26.6M|        const __m256i res = _mm256_packs_epi32(res_even, res_odd);
  131|  26.6M|        const __m256i res_clamped =
  132|  26.6M|            _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
  133|       |
  134|       |        // Store in a temporary array
  135|  26.6M|        yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
  ------------------
  |  |   32|  26.6M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  26.6M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  136|  26.6M|      }
  137|  11.0M|    }
  138|   366k|  }
  139|       |
  140|       |  /* Vertical filter */
  141|   366k|  {
  142|   366k|    const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
  143|       |
  144|       |    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
  145|   366k|    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
  146|       |
  147|       |    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
  148|   366k|    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
  149|       |    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
  150|   366k|    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
  151|       |
  152|       |    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
  153|   366k|    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
  154|       |    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
  155|   366k|    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
  156|       |    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
  157|   366k|    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
  158|       |    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
  159|   366k|    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
  160|       |
  161|       |    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
  162|   366k|    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
  163|       |    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
  164|   366k|    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
  165|       |    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
  166|   366k|    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
  167|       |    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
  168|   366k|    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
  169|       |
  170|   366k|    const __m256i round_const =
  171|   366k|        _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
  172|   366k|                          (1 << (bd + conv_params->round_1 - 1)));
  173|       |
  174|  9.13M|    for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (174:21): [True: 8.76M, False: 366k]
  ------------------
  175|  28.6M|      for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (175:23): [True: 19.9M, False: 8.76M]
  ------------------
  176|  19.9M|        const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
  ------------------
  |  |   32|  19.9M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  19.9M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  177|       |
  178|       |        // Load 16-bit data from the output of the horizontal filter in
  179|       |        // which the pixels are ordered as follows:
  180|       |        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
  181|  19.9M|        const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
  ------------------
  |  |   32|  19.9M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  19.9M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  182|  19.9M|        const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
  ------------------
  |  |   32|  19.9M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  19.9M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  183|  19.9M|        const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
  ------------------
  |  |   32|  19.9M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  19.9M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  184|  19.9M|        const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
  ------------------
  |  |   32|  19.9M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  19.9M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  185|  19.9M|        const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
  ------------------
  |  |   32|  19.9M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  19.9M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  186|  19.9M|        const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
  ------------------
  |  |   32|  19.9M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  19.9M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  187|  19.9M|        const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
  ------------------
  |  |   32|  19.9M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  19.9M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  188|  19.9M|        const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
  ------------------
  |  |   32|  19.9M|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|  19.9M|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  189|       |
  190|       |        // Filter the even-indices, increasing to 32-bit precision
  191|  19.9M|        const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
  192|  19.9M|        const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
  193|  19.9M|        const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
  194|  19.9M|        const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
  195|       |
  196|  19.9M|        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
  197|  19.9M|        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
  198|  19.9M|        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
  199|  19.9M|        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
  200|       |
  201|  19.9M|        const __m256i res_even = _mm256_add_epi32(
  202|  19.9M|            _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
  203|       |
  204|       |        // Filter the odd-indices, increasing to 32-bit precision
  205|  19.9M|        const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
  206|  19.9M|        const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
  207|  19.9M|        const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
  208|  19.9M|        const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
  209|       |
  210|  19.9M|        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
  211|  19.9M|        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
  212|  19.9M|        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
  213|  19.9M|        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
  214|       |
  215|  19.9M|        const __m256i res_odd = _mm256_add_epi32(
  216|  19.9M|            _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
  217|       |
  218|       |        // Pixels are currently in the following order:
  219|       |        // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
  220|       |        // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
  221|       |        //
  222|       |        // Rearrange the pixels into the following order:
  223|       |        // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
  224|       |        // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
  225|  19.9M|        const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
  226|  19.9M|        const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
  227|       |
  228|  19.9M|        const __m256i res_lo_round = _mm256_srai_epi32(
  229|  19.9M|            _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
  230|  19.9M|        const __m256i res_hi_round = _mm256_srai_epi32(
  231|  19.9M|            _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
  232|       |
  233|       |        // Reduce to 16-bit precision and pack into the correct order:
  234|       |        // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
  235|  19.9M|        const __m256i res_16bit =
  236|  19.9M|            _mm256_packs_epi32(res_lo_round, res_hi_round);
  237|  19.9M|        const __m256i res_16bit_clamped = _mm256_min_epi16(
  238|  19.9M|            _mm256_max_epi16(res_16bit, clamp_low), clamp_high);
  239|       |
  240|       |        // Store in the dst array
  241|  19.9M|        yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
  242|  19.9M|      }
  243|  8.76M|    }
  244|   366k|  }
  245|   366k|}

av1_filter_intra_edge_sse4_1:
   18|  3.57M|void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
   19|  3.57M|  if (!strength) return;
  ------------------
  |  Branch (19:7): [True: 1.18M, False: 2.39M]
  ------------------
   20|       |
   21|  2.39M|  DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
  ------------------
  |  |   19|  2.39M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   22|  2.39M|    { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
   23|  2.39M|    { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
   24|  2.39M|    { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
   25|  2.39M|  };
   26|       |
   27|  2.39M|  DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
  ------------------
  |  |   19|  2.39M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   28|  2.39M|    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
   29|  2.39M|    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
   30|  2.39M|    { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
   31|  2.39M|    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   32|  2.39M|  };
   33|       |
   34|       |  // Extend the first and last samples to simplify the loop for the 5-tap case
   35|  2.39M|  p[-1] = p[0];
   36|  2.39M|  __m128i last = _mm_set1_epi8((char)p[sz - 1]);
   37|  2.39M|  _mm_storeu_si128((__m128i *)&p[sz], last);
   38|       |
   39|       |  // Adjust input pointer for filter support area
   40|  2.39M|  uint8_t *in = (strength == 3) ? p - 1 : p;
  ------------------
  |  Branch (40:17): [True: 1.26M, False: 1.12M]
  ------------------
   41|       |
   42|       |  // Avoid modifying first sample
   43|  2.39M|  uint8_t *out = p + 1;
   44|  2.39M|  int len = sz - 1;
   45|       |
   46|  2.39M|  const int use_3tap_filter = (strength < 3);
   47|       |
   48|  2.39M|  if (use_3tap_filter) {
  ------------------
  |  Branch (48:7): [True: 1.12M, False: 1.26M]
  ------------------
   49|  1.12M|    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
   50|  1.12M|    __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
   51|  1.12M|    __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
   52|  1.12M|    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
   53|  1.12M|    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
   54|  2.83M|    while (len > 0) {
  ------------------
  |  Branch (54:12): [True: 1.71M, False: 1.12M]
  ------------------
   55|  1.71M|      int n_out = (len < 8) ? len : 8;
  ------------------
  |  Branch (55:19): [True: 607k, False: 1.10M]
  ------------------
   56|  1.71M|      __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
   57|  1.71M|      __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
   58|  1.71M|      d0 = _mm_maddubs_epi16(d0, coef0);
   59|  1.71M|      d1 = _mm_maddubs_epi16(d1, coef0);
   60|  1.71M|      d0 = _mm_hadd_epi16(d0, d1);
   61|  1.71M|      __m128i eight = _mm_set1_epi16(8);
   62|  1.71M|      d0 = _mm_add_epi16(d0, eight);
   63|  1.71M|      d0 = _mm_srai_epi16(d0, 4);
   64|  1.71M|      d0 = _mm_packus_epi16(d0, d0);
   65|  1.71M|      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
   66|  1.71M|      __m128i n0 = _mm_set1_epi8(n_out);
   67|  1.71M|      __m128i mask = _mm_cmpgt_epi8(n0, iden);
   68|  1.71M|      out0 = _mm_blendv_epi8(out0, d0, mask);
   69|  1.71M|      _mm_storel_epi64((__m128i *)out, out0);
   70|  1.71M|      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
   71|  1.71M|      in0 = _mm_alignr_epi8(in1, in0, 8);
   72|  1.71M|      in += 8;
   73|  1.71M|      out += 8;
   74|  1.71M|      len -= n_out;
   75|  1.71M|    }
   76|  1.26M|  } else {  // 5-tap filter
   77|  1.26M|    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
   78|  1.26M|    __m128i two = _mm_set1_epi8(2);
   79|  1.26M|    __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
   80|  1.26M|    __m128i shuf_b = _mm_add_epi8(shuf_a, two);
   81|  1.26M|    __m128i shuf_c = _mm_add_epi8(shuf_b, two);
   82|  1.26M|    __m128i shuf_d = _mm_add_epi8(shuf_c, two);
   83|  1.26M|    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
   84|  1.26M|    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
   85|  6.48M|    while (len > 0) {
  ------------------
  |  Branch (85:12): [True: 5.22M, False: 1.26M]
  ------------------
   86|  5.22M|      int n_out = (len < 8) ? len : 8;
  ------------------
  |  Branch (86:19): [True: 95.8k, False: 5.12M]
  ------------------
   87|  5.22M|      __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
   88|  5.22M|      __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
   89|  5.22M|      __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
   90|  5.22M|      __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
   91|  5.22M|      d0 = _mm_maddubs_epi16(d0, coef0);
   92|  5.22M|      d1 = _mm_maddubs_epi16(d1, coef0);
   93|  5.22M|      d2 = _mm_maddubs_epi16(d2, coef0);
   94|  5.22M|      d3 = _mm_maddubs_epi16(d3, coef0);
   95|  5.22M|      d0 = _mm_hadd_epi16(d0, d1);
   96|  5.22M|      d2 = _mm_hadd_epi16(d2, d3);
   97|  5.22M|      d0 = _mm_hadd_epi16(d0, d2);
   98|  5.22M|      __m128i eight = _mm_set1_epi16(8);
   99|  5.22M|      d0 = _mm_add_epi16(d0, eight);
  100|  5.22M|      d0 = _mm_srai_epi16(d0, 4);
  101|  5.22M|      d0 = _mm_packus_epi16(d0, d0);
  102|  5.22M|      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
  103|  5.22M|      __m128i n0 = _mm_set1_epi8(n_out);
  104|  5.22M|      __m128i mask = _mm_cmpgt_epi8(n0, iden);
  105|  5.22M|      out0 = _mm_blendv_epi8(out0, d0, mask);
  106|  5.22M|      _mm_storel_epi64((__m128i *)out, out0);
  107|  5.22M|      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
  108|  5.22M|      in0 = _mm_alignr_epi8(in1, in0, 8);
  109|  5.22M|      in += 8;
  110|  5.22M|      out += 8;
  111|  5.22M|      len -= n_out;
  112|  5.22M|    }
  113|  1.26M|  }
  114|  2.39M|}
av1_upsample_intra_edge_sse4_1:
  116|   976k|void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
  117|       |  // interpolate half-sample positions
  118|   976k|  assert(sz <= 24);
  119|       |
  120|   976k|  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
  ------------------
  |  |   19|   976k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  121|   976k|    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
  122|   976k|  };
  123|       |
  124|   976k|  DECLARE_ALIGNED(
  ------------------
  |  |   19|   976k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  125|   976k|      16, static const int8_t,
  126|   976k|      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
  127|   976k|                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
  128|       |
  129|       |  // Extend first/last samples (upper-left p[-1], last p[sz-1])
  130|       |  // to support 4-tap filter
  131|   976k|  p[-2] = p[-1];
  132|   976k|  p[sz] = p[sz - 1];
  133|       |
  134|   976k|  uint8_t *in = &p[-2];
  135|   976k|  uint8_t *out = &p[-2];
  136|       |
  137|   976k|  int n = sz + 1;  // Input length including upper-left sample
  138|       |
  139|   976k|  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  140|   976k|  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
  141|       |
  142|   976k|  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
  143|   976k|  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
  144|   976k|  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
  145|       |
  146|  2.05M|  while (n > 0) {
  ------------------
  |  Branch (146:10): [True: 1.08M, False: 976k]
  ------------------
  147|  1.08M|    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
  148|  1.08M|    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
  149|  1.08M|    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
  150|  1.08M|    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
  151|  1.08M|    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
  152|  1.08M|    d0 = _mm_maddubs_epi16(d0, coef0);
  153|  1.08M|    d1 = _mm_maddubs_epi16(d1, coef0);
  154|  1.08M|    d2 = _mm_maddubs_epi16(d2, coef0);
  155|  1.08M|    d3 = _mm_maddubs_epi16(d3, coef0);
  156|  1.08M|    d0 = _mm_hadd_epi16(d0, d1);
  157|  1.08M|    d2 = _mm_hadd_epi16(d2, d3);
  158|  1.08M|    __m128i eight = _mm_set1_epi16(8);
  159|  1.08M|    d0 = _mm_add_epi16(d0, eight);
  160|  1.08M|    d2 = _mm_add_epi16(d2, eight);
  161|  1.08M|    d0 = _mm_srai_epi16(d0, 4);
  162|  1.08M|    d2 = _mm_srai_epi16(d2, 4);
  163|  1.08M|    d0 = _mm_packus_epi16(d0, d2);
  164|  1.08M|    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
  165|  1.08M|    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
  166|  1.08M|    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
  167|  1.08M|    _mm_storeu_si128((__m128i *)&out[0], out0);
  168|  1.08M|    _mm_storeu_si128((__m128i *)&out[16], out1);
  169|  1.08M|    in0 = in16;
  170|  1.08M|    in16 = _mm_setzero_si128();
  171|  1.08M|    out += 32;
  172|  1.08M|    n -= 16;
  173|  1.08M|  }
  174|   976k|}
av1_highbd_filter_intra_edge_sse4_1:
  178|  3.57M|void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
  179|  3.57M|  if (!strength) return;
  ------------------
  |  Branch (179:7): [True: 1.10M, False: 2.47M]
  ------------------
  180|       |
  181|  2.47M|  DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
  ------------------
  |  |   19|  2.47M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  182|  2.47M|    { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
  183|  2.47M|    { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
  184|  2.47M|    { 2, 4, 2, 4, 2, 4, 2, 4 }   // strength 3: 2,4,4,4,2
  185|  2.47M|  };
  186|       |
  187|  2.47M|  DECLARE_ALIGNED(16, static const int16_t,
  ------------------
  |  |   19|  2.47M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  188|  2.47M|                  v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
  189|       |
  190|       |  // Extend the first and last samples to simplify the loop for the 5-tap case
  191|  2.47M|  p[-1] = p[0];
  192|  2.47M|  __m128i last = _mm_set1_epi16(p[sz - 1]);
  193|  2.47M|  _mm_storeu_si128((__m128i *)&p[sz], last);
  194|       |
  195|       |  // Adjust input pointer for filter support area
  196|  2.47M|  uint16_t *in = (strength == 3) ? p - 1 : p;
  ------------------
  |  Branch (196:18): [True: 1.46M, False: 1.00M]
  ------------------
  197|       |
  198|       |  // Avoid modifying first sample
  199|  2.47M|  uint16_t *out = p + 1;
  200|  2.47M|  int len = sz - 1;
  201|       |
  202|  2.47M|  const int use_3tap_filter = (strength < 3);
  203|       |
  204|  2.47M|  if (use_3tap_filter) {
  ------------------
  |  Branch (204:7): [True: 1.00M, False: 1.46M]
  ------------------
  205|  1.00M|    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
  206|  1.00M|    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
  207|  1.00M|    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  208|  1.00M|    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  209|  2.76M|    while (len > 0) {
  ------------------
  |  Branch (209:12): [True: 1.76M, False: 1.00M]
  ------------------
  210|  1.76M|      int n_out = (len < 8) ? len : 8;
  ------------------
  |  Branch (210:19): [True: 374k, False: 1.38M]
  ------------------
  211|  1.76M|      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
  212|  1.76M|      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
  213|  1.76M|      __m128i in02 = _mm_add_epi16(in0, in2);
  214|  1.76M|      __m128i d0 = _mm_unpacklo_epi16(in02, in1);
  215|  1.76M|      __m128i d1 = _mm_unpackhi_epi16(in02, in1);
  216|  1.76M|      d0 = _mm_mullo_epi16(d0, coef0);
  217|  1.76M|      d1 = _mm_mullo_epi16(d1, coef0);
  218|  1.76M|      d0 = _mm_hadd_epi16(d0, d1);
  219|  1.76M|      __m128i eight = _mm_set1_epi16(8);
  220|  1.76M|      d0 = _mm_add_epi16(d0, eight);
  221|  1.76M|      d0 = _mm_srli_epi16(d0, 4);
  222|  1.76M|      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
  223|  1.76M|      __m128i n0 = _mm_set1_epi16(n_out);
  224|  1.76M|      __m128i mask = _mm_cmpgt_epi16(n0, iden);
  225|  1.76M|      out0 = _mm_blendv_epi8(out0, d0, mask);
  226|  1.76M|      _mm_storeu_si128((__m128i *)out, out0);
  227|  1.76M|      in += 8;
  228|  1.76M|      in0 = in8;
  229|  1.76M|      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  230|  1.76M|      out += 8;
  231|  1.76M|      len -= n_out;
  232|  1.76M|    }
  233|  1.46M|  } else {  // 5-tap filter
  234|  1.46M|    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
  235|  1.46M|    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
  236|  1.46M|    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  237|  1.46M|    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  238|  7.42M|    while (len > 0) {
  ------------------
  |  Branch (238:12): [True: 5.95M, False: 1.46M]
  ------------------
  239|  5.95M|      int n_out = (len < 8) ? len : 8;
  ------------------
  |  Branch (239:19): [True: 108k, False: 5.84M]
  ------------------
  240|  5.95M|      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
  241|  5.95M|      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
  242|  5.95M|      __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
  243|  5.95M|      __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
  244|  5.95M|      __m128i in04 = _mm_add_epi16(in0, in4);
  245|  5.95M|      __m128i in123 = _mm_add_epi16(in1, in2);
  246|  5.95M|      in123 = _mm_add_epi16(in123, in3);
  247|  5.95M|      __m128i d0 = _mm_unpacklo_epi16(in04, in123);
  248|  5.95M|      __m128i d1 = _mm_unpackhi_epi16(in04, in123);
  249|  5.95M|      d0 = _mm_mullo_epi16(d0, coef0);
  250|  5.95M|      d1 = _mm_mullo_epi16(d1, coef0);
  251|  5.95M|      d0 = _mm_hadd_epi16(d0, d1);
  252|  5.95M|      __m128i eight = _mm_set1_epi16(8);
  253|  5.95M|      d0 = _mm_add_epi16(d0, eight);
  254|  5.95M|      d0 = _mm_srli_epi16(d0, 4);
  255|  5.95M|      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
  256|  5.95M|      __m128i n0 = _mm_set1_epi16(n_out);
  257|  5.95M|      __m128i mask = _mm_cmpgt_epi16(n0, iden);
  258|  5.95M|      out0 = _mm_blendv_epi8(out0, d0, mask);
  259|  5.95M|      _mm_storeu_si128((__m128i *)out, out0);
  260|  5.95M|      in += 8;
  261|  5.95M|      in0 = in8;
  262|  5.95M|      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  263|  5.95M|      out += 8;
  264|  5.95M|      len -= n_out;
  265|  5.95M|    }
  266|  1.46M|  }
  267|  2.47M|}
av1_highbd_upsample_intra_edge_sse4_1:
  269|   847k|void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
  270|       |  // interpolate half-sample positions
  271|   847k|  assert(sz <= 24);
  272|       |
  273|   847k|  DECLARE_ALIGNED(16, static const int16_t,
  ------------------
  |  |   19|   847k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  274|   847k|                  kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
  275|       |
  276|       |  // Extend first/last samples (upper-left p[-1], last p[sz-1])
  277|       |  // to support 4-tap filter
  278|   847k|  p[-2] = p[-1];
  279|   847k|  p[sz] = p[sz - 1];
  280|       |
  281|   847k|  uint16_t *in = &p[-2];
  282|   847k|  uint16_t *out = in;
  283|   847k|  int n = sz + 1;
  284|       |
  285|   847k|  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
  286|   847k|  __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
  287|   847k|  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
  288|   847k|  __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
  289|       |
  290|  2.46M|  while (n > 0) {
  ------------------
  |  Branch (290:10): [True: 1.62M, False: 847k]
  ------------------
  291|  1.62M|    __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
  292|  1.62M|    __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
  293|  1.62M|    __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
  294|  1.62M|    __m128i sum0 = _mm_add_epi16(in0, in3);
  295|  1.62M|    __m128i sum1 = _mm_add_epi16(in1, in2);
  296|  1.62M|    __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
  297|  1.62M|    __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
  298|  1.62M|    __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
  299|  1.62M|    d0 = _mm_madd_epi16(d0, coef0);
  300|  1.62M|    d1 = _mm_madd_epi16(d1, coef0);
  301|  1.62M|    __m128i eight = _mm_set1_epi32(8);
  302|  1.62M|    d0 = _mm_add_epi32(d0, eight);
  303|  1.62M|    d1 = _mm_add_epi32(d1, eight);
  304|  1.62M|    d0 = _mm_srai_epi32(d0, 4);
  305|  1.62M|    d1 = _mm_srai_epi32(d1, 4);
  306|  1.62M|    d0 = _mm_packus_epi32(d0, d1);
  307|  1.62M|    __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
  308|  1.62M|    d0 = _mm_min_epi16(d0, max0);
  309|  1.62M|    __m128i out0 = _mm_unpacklo_epi16(in1, d0);
  310|  1.62M|    __m128i out1 = _mm_unpackhi_epi16(in1, d0);
  311|  1.62M|    _mm_storeu_si128((__m128i *)&out[0], out0);
  312|  1.62M|    _mm_storeu_si128((__m128i *)&out[8], out1);
  313|  1.62M|    in0 = in8;
  314|  1.62M|    in8 = in16;
  315|  1.62M|    in16 = in24;
  316|  1.62M|    in24 = _mm_setzero_si128();
  317|  1.62M|    out += 16;
  318|  1.62M|    n -= 8;
  319|  1.62M|  }
  320|   847k|}

av1_dist_wtd_convolve_x_avx2:
   46|   153k|                                  ConvolveParams *conv_params) {
   47|   153k|  CONV_BUF_TYPE *dst = conv_params->dst;
   48|   153k|  int dst_stride = conv_params->dst_stride;
   49|   153k|  const int bd = 8;
   50|   153k|  int i, j, is_horiz_4tap = 0;
   51|   153k|  const int bits = FILTER_BITS - conv_params->round_1;
  ------------------
  |  |   21|   153k|#define FILTER_BITS 7
  ------------------
   52|   153k|  const __m256i wt = unpack_weights_avx2(conv_params);
   53|   153k|  const int do_average = conv_params->do_average;
   54|   153k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
   55|   153k|  const int offset_0 =
   56|   153k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   153k|#define FILTER_BITS 7
  ------------------
   57|   153k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
   58|   153k|  const __m256i offset_const = _mm256_set1_epi16(offset);
   59|   153k|  const int rounding_shift =
   60|   153k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   153k|#define FILTER_BITS 7
  ------------------
   61|   153k|  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
   62|       |
   63|   153k|  assert(bits >= 0);
   64|   153k|  assert(conv_params->round_0 > 0);
   65|       |
   66|   153k|  const __m256i round_const =
   67|   153k|      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
   68|   153k|  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
   69|       |
   70|   153k|  __m256i filt[4], coeffs[4];
   71|       |
   72|   153k|  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
   73|   153k|  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
   74|       |
   75|   153k|  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
   76|       |
   77|       |  // Condition for checking valid horz_filt taps
   78|   153k|  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
  ------------------
  |  Branch (78:7): [True: 58.8k, False: 94.9k]
  ------------------
   79|  58.8k|    is_horiz_4tap = 1;
   80|       |
   81|       |  // horz_filt as 4 tap
   82|   153k|  if (is_horiz_4tap) {
  ------------------
  |  Branch (82:7): [True: 58.8k, False: 94.9k]
  ------------------
   83|  58.8k|    const int fo_horiz = 1;
   84|  58.8k|    const uint8_t *const src_ptr = src - fo_horiz;
   85|   463k|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (85:17): [True: 404k, False: 58.8k]
  ------------------
   86|   404k|      const uint8_t *src_data = src_ptr + i * src_stride;
   87|   404k|      CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
   88|  2.06M|      for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (88:19): [True: 1.66M, False: 404k]
  ------------------
   89|  1.66M|        const __m256i data =
   90|  1.66M|            load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
   91|       |
   92|  1.66M|        __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
   93|  1.66M|        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
   94|  1.66M|        res = _mm256_slli_epi16(res, bits);
   95|       |
   96|  1.66M|        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
   97|       |
   98|       |        // Accumulate values into the destination buffer
   99|  1.66M|        if (do_average) {
  ------------------
  |  Branch (99:13): [True: 461k, False: 1.20M]
  ------------------
  100|   461k|          const __m256i data_ref_0 =
  101|   461k|              load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
  102|   461k|          const __m256i comp_avg_res =
  103|   461k|              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
  104|       |
  105|   461k|          const __m256i round_result = convolve_rounding(
  106|   461k|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  107|       |
  108|   461k|          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
  109|   461k|          const __m128i res_0 = _mm256_castsi256_si128(res_8);
  110|   461k|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  111|       |
  112|   461k|          if (w > 4) {
  ------------------
  |  Branch (112:15): [True: 424k, False: 37.3k]
  ------------------
  113|   424k|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  114|   424k|            _mm_storel_epi64(
  115|   424k|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  116|   424k|          } else {
  117|  37.3k|            *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  118|  37.3k|            *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  119|  37.3k|                _mm_cvtsi128_si32(res_1);
  120|  37.3k|          }
  121|  1.20M|        } else {
  122|  1.20M|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
  123|  1.20M|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  124|       |
  125|  1.20M|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
  126|  1.20M|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  127|  1.20M|                          res_1);
  128|  1.20M|        }
  129|  1.66M|      }
  130|   404k|    }
  131|  94.9k|  } else {
  132|  94.9k|    const int fo_horiz = filter_params_x->taps / 2 - 1;
  133|  94.9k|    const uint8_t *const src_ptr = src - fo_horiz;
  134|       |
  135|  94.9k|    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  136|  94.9k|    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
  137|   879k|    for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (137:17): [True: 784k, False: 94.9k]
  ------------------
  138|   784k|      const uint8_t *src_data = src_ptr + i * src_stride;
  139|   784k|      CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
  140|  3.89M|      for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (140:19): [True: 3.10M, False: 784k]
  ------------------
  141|  3.10M|        const __m256i data =
  142|  3.10M|            load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
  143|       |
  144|  3.10M|        __m256i res = convolve_lowbd_x(data, coeffs, filt);
  145|       |
  146|  3.10M|        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
  147|       |
  148|  3.10M|        res = _mm256_slli_epi16(res, bits);
  149|       |
  150|  3.10M|        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
  151|       |
  152|       |        // Accumulate values into the destination buffer
  153|  3.10M|        if (do_average) {
  ------------------
  |  Branch (153:13): [True: 1.41M, False: 1.68M]
  ------------------
  154|  1.41M|          const __m256i data_ref_0 =
  155|  1.41M|              load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
  156|  1.41M|          const __m256i comp_avg_res =
  157|  1.41M|              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
  158|       |
  159|  1.41M|          const __m256i round_result = convolve_rounding(
  160|  1.41M|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  161|       |
  162|  1.41M|          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
  163|  1.41M|          const __m128i res_0 = _mm256_castsi256_si128(res_8);
  164|  1.41M|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  165|       |
  166|  1.41M|          if (w > 4) {
  ------------------
  |  Branch (166:15): [True: 1.41M, False: 2.30k]
  ------------------
  167|  1.41M|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  168|  1.41M|            _mm_storel_epi64(
  169|  1.41M|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  170|  1.41M|          } else {
  171|  2.30k|            *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  172|  2.30k|            *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  173|  2.30k|                _mm_cvtsi128_si32(res_1);
  174|  2.30k|          }
  175|  1.68M|        } else {
  176|  1.68M|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
  177|  1.68M|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  178|       |
  179|  1.68M|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
  180|  1.68M|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  181|  1.68M|                          res_1);
  182|  1.68M|        }
  183|  3.10M|      }
  184|   784k|    }
  185|  94.9k|  }
  186|   153k|}
av1_dist_wtd_convolve_y_avx2:
  192|  79.3k|                                  ConvolveParams *conv_params) {
  193|  79.3k|  CONV_BUF_TYPE *dst = conv_params->dst;
  194|  79.3k|  int dst_stride = conv_params->dst_stride;
  195|  79.3k|  const int bd = 8;
  196|  79.3k|  int i, j, is_vert_4tap = 0;
  197|       |  // +1 to compensate for dividing the filter coeffs by 2
  198|  79.3k|  const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
  ------------------
  |  |   21|  79.3k|#define FILTER_BITS 7
  ------------------
  199|  79.3k|  const __m256i round_const =
  200|  79.3k|      _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
  201|  79.3k|  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
  202|  79.3k|  const __m256i wt = unpack_weights_avx2(conv_params);
  203|  79.3k|  const int do_average = conv_params->do_average;
  204|  79.3k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  205|  79.3k|  const int offset_0 =
  206|  79.3k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  79.3k|#define FILTER_BITS 7
  ------------------
  207|  79.3k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  208|  79.3k|  const __m256i offset_const = _mm256_set1_epi16(offset);
  209|  79.3k|  const int offset_1 = (1 << (bd + FILTER_BITS - 2));
  ------------------
  |  |   21|  79.3k|#define FILTER_BITS 7
  ------------------
  210|  79.3k|  const __m256i offset_const_1 = _mm256_set1_epi16(offset_1);
  211|  79.3k|  const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0));
  212|  79.3k|  const int rounding_shift =
  213|  79.3k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|  79.3k|#define FILTER_BITS 7
  ------------------
  214|  79.3k|  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
  215|  79.3k|  const __m256i zero = _mm256_setzero_si256();
  216|  79.3k|  __m256i coeffs[4], s[8];
  217|       |
  218|  79.3k|  assert((FILTER_BITS - conv_params->round_0) >= 0);
  219|       |
  220|  79.3k|  prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  221|       |
  222|       |  // Condition for checking valid vert_filt taps
  223|  79.3k|  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
  ------------------
  |  Branch (223:7): [True: 35.2k, False: 44.1k]
  ------------------
  224|  35.2k|    is_vert_4tap = 1;
  225|       |
  226|  79.3k|  if (is_vert_4tap) {
  ------------------
  |  Branch (226:7): [True: 35.2k, False: 44.1k]
  ------------------
  227|  35.2k|    const int fo_vert = 1;
  228|  35.2k|    const uint8_t *const src_ptr = src - fo_vert * src_stride;
  229|  72.8k|    for (j = 0; j < w; j += 16) {
  ------------------
  |  Branch (229:17): [True: 37.5k, False: 35.2k]
  ------------------
  230|  37.5k|      const uint8_t *data = &src_ptr[j];
  231|  37.5k|      __m256i src4;
  232|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
  233|  37.5k|      {
  234|  37.5k|        __m256i src_ab[4];
  235|  37.5k|        __m256i src_a[5];
  236|  37.5k|        src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  237|   187k|        for (int kk = 0; kk < 4; ++kk) {
  ------------------
  |  Branch (237:26): [True: 150k, False: 37.5k]
  ------------------
  238|   150k|          data += src_stride;
  239|   150k|          src_a[kk + 1] =
  240|   150k|              _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  241|   150k|          src_ab[kk] =
  242|   150k|              _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
  243|   150k|        }
  244|  37.5k|        src4 = src_a[4];
  245|  37.5k|        s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
  246|  37.5k|        s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
  247|       |
  248|  37.5k|        s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
  249|  37.5k|        s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
  250|  37.5k|      }
  251|       |
  252|   238k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (252:19): [True: 201k, False: 37.5k]
  ------------------
  253|   201k|        data = &src_ptr[(i + 5) * src_stride + j];
  254|   201k|        const __m256i src5 =
  255|   201k|            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  256|   201k|        const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20);
  257|       |
  258|   201k|        src4 = _mm256_castsi128_si256(
  259|   201k|            _mm_loadu_si128((__m128i *)(data + src_stride)));
  260|   201k|        const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20);
  261|       |
  262|   201k|        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
  263|   201k|        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
  264|       |
  265|   201k|        __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
  266|       |
  267|   201k|        res_lo = _mm256_add_epi16(res_lo, offset_const_1);
  268|       |
  269|   201k|        const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
  270|   201k|        const __m256i res_lo_0_shift =
  271|   201k|            _mm256_slli_epi32(res_lo_0_32b, left_shift);
  272|   201k|        const __m256i res_lo_0_round = _mm256_sra_epi32(
  273|   201k|            _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
  274|       |
  275|   201k|        const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
  276|   201k|        const __m256i res_lo_1_shift =
  277|   201k|            _mm256_slli_epi32(res_lo_1_32b, left_shift);
  278|   201k|        const __m256i res_lo_1_round = _mm256_sra_epi32(
  279|   201k|            _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
  280|       |
  281|   201k|        const __m256i res_lo_round =
  282|   201k|            _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
  283|       |
  284|   201k|        const __m256i res_lo_unsigned =
  285|   201k|            _mm256_add_epi16(res_lo_round, offset_const_2);
  286|       |
  287|   201k|        if (w - j < 16) {
  ------------------
  |  Branch (287:13): [True: 87.3k, False: 114k]
  ------------------
  288|  87.3k|          if (do_average) {
  ------------------
  |  Branch (288:15): [True: 59.1k, False: 28.2k]
  ------------------
  289|  59.1k|            const __m256i data_ref_0 =
  290|  59.1k|                load_line2_avx2(&dst[i * dst_stride + j],
  291|  59.1k|                                &dst[i * dst_stride + j + dst_stride]);
  292|  59.1k|            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
  293|  59.1k|                                                  &wt, use_dist_wtd_comp_avg);
  294|       |
  295|  59.1k|            const __m256i round_result = convolve_rounding(
  296|  59.1k|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  297|       |
  298|  59.1k|            const __m256i res_8 =
  299|  59.1k|                _mm256_packus_epi16(round_result, round_result);
  300|  59.1k|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  301|  59.1k|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  302|       |
  303|  59.1k|            if (w - j > 4) {
  ------------------
  |  Branch (303:17): [True: 31.0k, False: 28.0k]
  ------------------
  304|  31.0k|              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  305|  31.0k|              _mm_storel_epi64(
  306|  31.0k|                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
  307|  31.0k|                  res_1);
  308|  31.0k|            } else {
  309|  28.0k|              *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  310|  28.0k|              *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  311|  28.0k|                  _mm_cvtsi128_si32(res_1);
  312|  28.0k|            }
  313|  59.1k|          } else {
  314|  28.2k|            const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
  315|  28.2k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  316|       |
  317|  28.2k|            const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
  318|  28.2k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  319|  28.2k|                            res_1);
  320|  28.2k|          }
  321|   114k|        } else {
  322|   114k|          __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
  323|       |
  324|   114k|          res_hi = _mm256_add_epi16(res_hi, offset_const_1);
  325|       |
  326|   114k|          const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
  327|   114k|          const __m256i res_hi_0_shift =
  328|   114k|              _mm256_slli_epi32(res_hi_0_32b, left_shift);
  329|   114k|          const __m256i res_hi_0_round = _mm256_sra_epi32(
  330|   114k|              _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
  331|       |
  332|   114k|          const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
  333|   114k|          const __m256i res_hi_1_shift =
  334|   114k|              _mm256_slli_epi32(res_hi_1_32b, left_shift);
  335|   114k|          const __m256i res_hi_1_round = _mm256_sra_epi32(
  336|   114k|              _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
  337|       |
  338|   114k|          const __m256i res_hi_round =
  339|   114k|              _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
  340|       |
  341|   114k|          const __m256i res_hi_unsigned =
  342|   114k|              _mm256_add_epi16(res_hi_round, offset_const_2);
  343|       |
  344|   114k|          if (do_average) {
  ------------------
  |  Branch (344:15): [True: 34.3k, False: 79.7k]
  ------------------
  345|  34.3k|            const __m256i data_ref_0_lo =
  346|  34.3k|                load_line2_avx2(&dst[i * dst_stride + j],
  347|  34.3k|                                &dst[i * dst_stride + j + dst_stride]);
  348|       |
  349|  34.3k|            const __m256i data_ref_0_hi =
  350|  34.3k|                load_line2_avx2(&dst[i * dst_stride + j + 8],
  351|  34.3k|                                &dst[i * dst_stride + j + 8 + dst_stride]);
  352|       |
  353|  34.3k|            const __m256i comp_avg_res_lo = comp_avg(
  354|  34.3k|                &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
  355|       |
  356|  34.3k|            const __m256i comp_avg_res_hi = comp_avg(
  357|  34.3k|                &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
  358|       |
  359|  34.3k|            const __m256i round_result_lo =
  360|  34.3k|                convolve_rounding(&comp_avg_res_lo, &offset_const,
  361|  34.3k|                                  &rounding_const, rounding_shift);
  362|       |
  363|  34.3k|            const __m256i round_result_hi =
  364|  34.3k|                convolve_rounding(&comp_avg_res_hi, &offset_const,
  365|  34.3k|                                  &rounding_const, rounding_shift);
  366|       |
  367|  34.3k|            const __m256i res_8 =
  368|  34.3k|                _mm256_packus_epi16(round_result_lo, round_result_hi);
  369|  34.3k|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  370|  34.3k|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  371|       |
  372|  34.3k|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  373|  34.3k|            _mm_store_si128(
  374|  34.3k|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  375|       |
  376|  79.7k|          } else {
  377|  79.7k|            const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
  378|  79.7k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
  379|       |
  380|  79.7k|            const __m128i res_lo_1 =
  381|  79.7k|                _mm256_extracti128_si256(res_lo_unsigned, 1);
  382|  79.7k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  383|  79.7k|                            res_lo_1);
  384|       |
  385|  79.7k|            const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
  386|  79.7k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
  387|  79.7k|                            res_hi_0);
  388|       |
  389|  79.7k|            const __m128i res_hi_1 =
  390|  79.7k|                _mm256_extracti128_si256(res_hi_unsigned, 1);
  391|  79.7k|            _mm_store_si128(
  392|  79.7k|                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
  393|  79.7k|                res_hi_1);
  394|  79.7k|          }
  395|   114k|        }
  396|   201k|        s[0] = s[1];
  397|   201k|        s[1] = s[2];
  398|       |
  399|   201k|        s[3] = s[4];
  400|   201k|        s[4] = s[5];
  401|   201k|      }
  402|  37.5k|    }
  403|  44.1k|  } else {
  404|  44.1k|    const int fo_vert = filter_params_y->taps / 2 - 1;
  405|  44.1k|    const uint8_t *const src_ptr = src - fo_vert * src_stride;
  406|   103k|    for (j = 0; j < w; j += 16) {
  ------------------
  |  Branch (406:17): [True: 59.6k, False: 44.1k]
  ------------------
  407|  59.6k|      const uint8_t *data = &src_ptr[j];
  408|  59.6k|      __m256i src6;
  409|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
  410|  59.6k|      {
  411|  59.6k|        __m256i src_ab[7];
  412|  59.6k|        __m256i src_a[7];
  413|  59.6k|        src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  414|   417k|        for (int kk = 0; kk < 6; ++kk) {
  ------------------
  |  Branch (414:26): [True: 357k, False: 59.6k]
  ------------------
  415|   357k|          data += src_stride;
  416|   357k|          src_a[kk + 1] =
  417|   357k|              _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  418|   357k|          src_ab[kk] =
  419|   357k|              _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
  420|   357k|        }
  421|  59.6k|        src6 = src_a[6];
  422|  59.6k|        s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
  423|  59.6k|        s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
  424|  59.6k|        s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
  425|  59.6k|        s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
  426|  59.6k|        s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
  427|  59.6k|        s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
  428|  59.6k|      }
  429|       |
  430|   697k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (430:19): [True: 638k, False: 59.6k]
  ------------------
  431|   638k|        data = &src_ptr[(i + 7) * src_stride + j];
  432|   638k|        const __m256i src7 =
  433|   638k|            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
  434|   638k|        const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
  435|       |
  436|   638k|        src6 = _mm256_castsi128_si256(
  437|   638k|            _mm_loadu_si128((__m128i *)(data + src_stride)));
  438|   638k|        const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
  439|       |
  440|   638k|        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
  441|   638k|        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
  442|       |
  443|   638k|        __m256i res_lo = convolve_lowbd(s, coeffs);
  444|       |
  445|   638k|        res_lo = _mm256_add_epi16(res_lo, offset_const_1);
  446|       |
  447|   638k|        const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
  448|   638k|        const __m256i res_lo_0_shift =
  449|   638k|            _mm256_slli_epi32(res_lo_0_32b, left_shift);
  450|   638k|        const __m256i res_lo_0_round = _mm256_sra_epi32(
  451|   638k|            _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
  452|       |
  453|   638k|        const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
  454|   638k|        const __m256i res_lo_1_shift =
  455|   638k|            _mm256_slli_epi32(res_lo_1_32b, left_shift);
  456|   638k|        const __m256i res_lo_1_round = _mm256_sra_epi32(
  457|   638k|            _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
  458|       |
  459|   638k|        const __m256i res_lo_round =
  460|   638k|            _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
  461|       |
  462|   638k|        const __m256i res_lo_unsigned =
  463|   638k|            _mm256_add_epi16(res_lo_round, offset_const_2);
  464|       |
  465|   638k|        if (w - j < 16) {
  ------------------
  |  Branch (465:13): [True: 118k, False: 519k]
  ------------------
  466|   118k|          if (do_average) {
  ------------------
  |  Branch (466:15): [True: 57.9k, False: 60.4k]
  ------------------
  467|  57.9k|            const __m256i data_ref_0 =
  468|  57.9k|                load_line2_avx2(&dst[i * dst_stride + j],
  469|  57.9k|                                &dst[i * dst_stride + j + dst_stride]);
  470|  57.9k|            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
  471|  57.9k|                                                  &wt, use_dist_wtd_comp_avg);
  472|       |
  473|  57.9k|            const __m256i round_result = convolve_rounding(
  474|  57.9k|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  475|       |
  476|  57.9k|            const __m256i res_8 =
  477|  57.9k|                _mm256_packus_epi16(round_result, round_result);
  478|  57.9k|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  479|  57.9k|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  480|       |
  481|  57.9k|            if (w - j > 4) {
  ------------------
  |  Branch (481:17): [True: 46.5k, False: 11.4k]
  ------------------
  482|  46.5k|              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  483|  46.5k|              _mm_storel_epi64(
  484|  46.5k|                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
  485|  46.5k|                  res_1);
  486|  46.5k|            } else {
  487|  11.4k|              *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  488|  11.4k|              *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  489|  11.4k|                  _mm_cvtsi128_si32(res_1);
  490|  11.4k|            }
  491|  60.4k|          } else {
  492|  60.4k|            const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
  493|  60.4k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  494|       |
  495|  60.4k|            const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
  496|  60.4k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  497|  60.4k|                            res_1);
  498|  60.4k|          }
  499|   519k|        } else {
  500|   519k|          __m256i res_hi = convolve_lowbd(s + 4, coeffs);
  501|       |
  502|   519k|          res_hi = _mm256_add_epi16(res_hi, offset_const_1);
  503|       |
  504|   519k|          const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
  505|   519k|          const __m256i res_hi_0_shift =
  506|   519k|              _mm256_slli_epi32(res_hi_0_32b, left_shift);
  507|   519k|          const __m256i res_hi_0_round = _mm256_sra_epi32(
  508|   519k|              _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
  509|       |
  510|   519k|          const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
  511|   519k|          const __m256i res_hi_1_shift =
  512|   519k|              _mm256_slli_epi32(res_hi_1_32b, left_shift);
  513|   519k|          const __m256i res_hi_1_round = _mm256_sra_epi32(
  514|   519k|              _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
  515|       |
  516|   519k|          const __m256i res_hi_round =
  517|   519k|              _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
  518|       |
  519|   519k|          const __m256i res_hi_unsigned =
  520|   519k|              _mm256_add_epi16(res_hi_round, offset_const_2);
  521|       |
  522|   519k|          if (do_average) {
  ------------------
  |  Branch (522:15): [True: 235k, False: 284k]
  ------------------
  523|   235k|            const __m256i data_ref_0_lo =
  524|   235k|                load_line2_avx2(&dst[i * dst_stride + j],
  525|   235k|                                &dst[i * dst_stride + j + dst_stride]);
  526|       |
  527|   235k|            const __m256i data_ref_0_hi =
  528|   235k|                load_line2_avx2(&dst[i * dst_stride + j + 8],
  529|   235k|                                &dst[i * dst_stride + j + 8 + dst_stride]);
  530|       |
  531|   235k|            const __m256i comp_avg_res_lo = comp_avg(
  532|   235k|                &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
  533|       |
  534|   235k|            const __m256i comp_avg_res_hi = comp_avg(
  535|   235k|                &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
  536|       |
  537|   235k|            const __m256i round_result_lo =
  538|   235k|                convolve_rounding(&comp_avg_res_lo, &offset_const,
  539|   235k|                                  &rounding_const, rounding_shift);
  540|       |
  541|   235k|            const __m256i round_result_hi =
  542|   235k|                convolve_rounding(&comp_avg_res_hi, &offset_const,
  543|   235k|                                  &rounding_const, rounding_shift);
  544|       |
  545|   235k|            const __m256i res_8 =
  546|   235k|                _mm256_packus_epi16(round_result_lo, round_result_hi);
  547|   235k|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  548|   235k|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  549|       |
  550|   235k|            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  551|   235k|            _mm_store_si128(
  552|   235k|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  553|       |
  554|   284k|          } else {
  555|   284k|            const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
  556|   284k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
  557|       |
  558|   284k|            const __m128i res_lo_1 =
  559|   284k|                _mm256_extracti128_si256(res_lo_unsigned, 1);
  560|   284k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  561|   284k|                            res_lo_1);
  562|       |
  563|   284k|            const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
  564|   284k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
  565|   284k|                            res_hi_0);
  566|       |
  567|   284k|            const __m128i res_hi_1 =
  568|   284k|                _mm256_extracti128_si256(res_hi_unsigned, 1);
  569|   284k|            _mm_store_si128(
  570|   284k|                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
  571|   284k|                res_hi_1);
  572|   284k|          }
  573|   519k|        }
  574|   638k|        s[0] = s[1];
  575|   638k|        s[1] = s[2];
  576|   638k|        s[2] = s[3];
  577|       |
  578|   638k|        s[4] = s[5];
  579|   638k|        s[5] = s[6];
  580|   638k|        s[6] = s[7];
  581|   638k|      }
  582|  59.6k|    }
  583|  44.1k|  }
  584|  79.3k|}
av1_dist_wtd_convolve_2d_avx2:
  591|   238k|                                   ConvolveParams *conv_params) {
  592|   238k|  CONV_BUF_TYPE *dst = conv_params->dst;
  593|   238k|  int dst_stride = conv_params->dst_stride;
  594|   238k|  const int bd = 8;
  595|       |
  596|   238k|  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  ------------------
  |  |   19|   238k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
  597|       |
  598|   238k|  int im_stride = 8;
  599|   238k|  int i, is_horiz_4tap = 0, is_vert_4tap = 0;
  600|   238k|  const __m256i wt = unpack_weights_avx2(conv_params);
  601|   238k|  const int do_average = conv_params->do_average;
  602|   238k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  603|   238k|  const int offset_0 =
  604|   238k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
  605|   238k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  606|   238k|  const __m256i offset_const = _mm256_set1_epi16(offset);
  607|   238k|  const int rounding_shift =
  608|   238k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
  609|   238k|  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
  610|       |
  611|   238k|  assert(conv_params->round_0 > 0);
  612|       |
  613|   238k|  const __m256i round_const_h = _mm256_set1_epi16(
  614|   238k|      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
  615|   238k|  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
  616|       |
  617|   238k|  const __m256i round_const_v = _mm256_set1_epi32(
  618|   238k|      ((1 << conv_params->round_1) >> 1) -
  619|   238k|      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
  620|   238k|  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
  621|       |
  622|   238k|  __m256i filt[4], coeffs_x[4], coeffs_y[4];
  623|       |
  624|   238k|  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
  625|   238k|  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  626|       |
  627|   238k|  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x);
  628|   238k|  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
  629|       |
  630|       |  // Condition for checking valid horz_filt taps
  631|   238k|  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
  ------------------
  |  Branch (631:7): [True: 107k, False: 130k]
  ------------------
  632|   107k|    is_horiz_4tap = 1;
  633|       |
  634|       |  // Condition for checking valid vert_filt taps
  635|   238k|  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
  ------------------
  |  Branch (635:7): [True: 119k, False: 118k]
  ------------------
  636|   119k|    is_vert_4tap = 1;
  637|       |
  638|   238k|  if (is_horiz_4tap) {
  ------------------
  |  Branch (638:7): [True: 107k, False: 130k]
  ------------------
  639|   107k|    int im_h = h + filter_params_y->taps - 1;
  640|   107k|    const int fo_vert = filter_params_y->taps / 2 - 1;
  641|   107k|    const int fo_horiz = 1;
  642|   107k|    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  643|   238k|    for (int j = 0; j < w; j += 8) {
  ------------------
  |  Branch (643:21): [True: 131k, False: 107k]
  ------------------
  644|       |      /* Horizontal filter */
  645|   131k|      const uint8_t *src_h = src_ptr + j;
  646|  1.46M|      for (i = 0; i < im_h; i += 2) {
  ------------------
  |  Branch (646:19): [True: 1.33M, False: 131k]
  ------------------
  647|  1.33M|        __m256i data =
  648|  1.33M|            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
  649|  1.33M|        if (i + 1 < im_h)
  ------------------
  |  Branch (649:13): [True: 1.20M, False: 131k]
  ------------------
  650|  1.20M|          data = _mm256_inserti128_si256(
  651|  1.33M|              data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
  652|  1.33M|        src_h += (src_stride << 1);
  653|  1.33M|        __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
  654|       |
  655|  1.33M|        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
  656|  1.33M|                               round_shift_h);
  657|       |
  658|  1.33M|        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
  659|  1.33M|      }
  660|   131k|      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
  ------------------
  |  |  501|   131k|  do {                                                                         \
  |  |  502|   131k|    __m256i s[8];                                                              \
  |  |  503|   131k|    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));    \
  |  |  504|   131k|    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));    \
  |  |  505|   131k|    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));    \
  |  |  506|   131k|    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));    \
  |  |  507|   131k|    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));    \
  |  |  508|   131k|    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));    \
  |  |  509|   131k|                                                                               \
  |  |  510|   131k|    s[0] = _mm256_unpacklo_epi16(s0, s1);                                      \
  |  |  511|   131k|    s[1] = _mm256_unpacklo_epi16(s2, s3);                                      \
  |  |  512|   131k|    s[2] = _mm256_unpacklo_epi16(s4, s5);                                      \
  |  |  513|   131k|                                                                               \
  |  |  514|   131k|    s[4] = _mm256_unpackhi_epi16(s0, s1);                                      \
  |  |  515|   131k|    s[5] = _mm256_unpackhi_epi16(s2, s3);                                      \
  |  |  516|   131k|    s[6] = _mm256_unpackhi_epi16(s4, s5);                                      \
  |  |  517|   131k|                                                                               \
  |  |  518|   943k|    for (i = 0; i < h; i += 2) {                                               \
  |  |  ------------------
  |  |  |  Branch (518:17): [True: 812k, False: 131k]
  |  |  ------------------
  |  |  519|   812k|      const int16_t *data = &im_block[i * im_stride];                          \
  |  |  520|   812k|                                                                               \
  |  |  521|   812k|      const __m256i s6 =                                                       \
  |  |  522|   812k|          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));               \
  |  |  523|   812k|      const __m256i s7 =                                                       \
  |  |  524|   812k|          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));               \
  |  |  525|   812k|                                                                               \
  |  |  526|   812k|      s[3] = _mm256_unpacklo_epi16(s6, s7);                                    \
  |  |  527|   812k|      s[7] = _mm256_unpackhi_epi16(s6, s7);                                    \
  |  |  528|   812k|                                                                               \
  |  |  529|   812k|      const __m256i res_a = convolve(s, coeffs_y);                             \
  |  |  530|   812k|      const __m256i res_a_round = _mm256_sra_epi32(                            \
  |  |  531|   812k|          _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
  |  |  532|   812k|                                                                               \
  |  |  533|   812k|      if (w - j > 4) {                                                         \
  |  |  ------------------
  |  |  |  Branch (533:11): [True: 603k, False: 208k]
  |  |  ------------------
  |  |  534|   603k|        const __m256i res_b = convolve(s + 4, coeffs_y);                       \
  |  |  535|   603k|        const __m256i res_b_round = _mm256_sra_epi32(                          \
  |  |  536|   603k|            _mm256_add_epi32(res_b, round_const_v), round_shift_v);            \
  |  |  537|   603k|        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);  \
  |  |  538|   603k|        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
  |  |  539|   603k|                                                                               \
  |  |  540|   603k|        if (do_average) {                                                      \
  |  |  ------------------
  |  |  |  Branch (540:13): [True: 373k, False: 230k]
  |  |  ------------------
  |  |  541|   373k|          const __m256i data_ref_0 =                                           \
  |  |  542|   373k|              load_line2_avx2(&dst[i * dst_stride + j],                        \
  |  |  543|   373k|                              &dst[i * dst_stride + j + dst_stride]);          \
  |  |  544|   373k|          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
  |  |  545|   373k|                                                &wt, use_dist_wtd_comp_avg);   \
  |  |  546|   373k|                                                                               \
  |  |  547|   373k|          const __m256i round_result = convolve_rounding(                      \
  |  |  548|   373k|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
  |  |  549|   373k|                                                                               \
  |  |  550|   373k|          const __m256i res_8 =                                                \
  |  |  551|   373k|              _mm256_packus_epi16(round_result, round_result);                 \
  |  |  552|   373k|          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
  |  |  553|   373k|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
  |  |  554|   373k|                                                                               \
  |  |  555|   373k|          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);    \
  |  |  556|   373k|          _mm_storel_epi64(                                                    \
  |  |  557|   373k|              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
  |  |  558|   373k|        } else {                                                               \
  |  |  559|   230k|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
  |  |  560|   230k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
  |  |  561|   230k|                                                                               \
  |  |  562|   230k|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
  |  |  563|   230k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
  |  |  564|   230k|                          res_1);                                              \
  |  |  565|   230k|        }                                                                      \
  |  |  566|   603k|      } else {                                                                 \
  |  |  567|   208k|        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);  \
  |  |  568|   208k|        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
  |  |  569|   208k|                                                                               \
  |  |  570|   208k|        if (do_average) {                                                      \
  |  |  ------------------
  |  |  |  Branch (570:13): [True: 107k, False: 101k]
  |  |  ------------------
  |  |  571|   107k|          const __m256i data_ref_0 =                                           \
  |  |  572|   107k|              load_line2_avx2(&dst[i * dst_stride + j],                        \
  |  |  573|   107k|                              &dst[i * dst_stride + j + dst_stride]);          \
  |  |  574|   107k|                                                                               \
  |  |  575|   107k|          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
  |  |  576|   107k|                                                &wt, use_dist_wtd_comp_avg);   \
  |  |  577|   107k|                                                                               \
  |  |  578|   107k|          const __m256i round_result = convolve_rounding(                      \
  |  |  579|   107k|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
  |  |  580|   107k|                                                                               \
  |  |  581|   107k|          const __m256i res_8 =                                                \
  |  |  582|   107k|              _mm256_packus_epi16(round_result, round_result);                 \
  |  |  583|   107k|          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
  |  |  584|   107k|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
  |  |  585|   107k|                                                                               \
  |  |  586|   107k|          *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);     \
  |  |  587|   107k|          *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =                 \
  |  |  588|   107k|              _mm_cvtsi128_si32(res_1);                                        \
  |  |  589|   107k|                                                                               \
  |  |  590|   107k|        } else {                                                               \
  |  |  591|   101k|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
  |  |  592|   101k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
  |  |  593|   101k|                                                                               \
  |  |  594|   101k|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
  |  |  595|   101k|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
  |  |  596|   101k|                          res_1);                                              \
  |  |  597|   101k|        }                                                                      \
  |  |  598|   208k|      }                                                                        \
  |  |  599|   812k|                                                                               \
  |  |  600|   812k|      s[0] = s[1];                                                             \
  |  |  601|   812k|      s[1] = s[2];                                                             \
  |  |  602|   812k|      s[2] = s[3];                                                             \
  |  |  603|   812k|                                                                               \
  |  |  604|   812k|      s[4] = s[5];                                                             \
  |  |  605|   812k|      s[5] = s[6];                                                             \
  |  |  606|   812k|      s[6] = s[7];                                                             \
  |  |  607|   812k|    }                                                                          \
  |  |  608|   131k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (608:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  661|   131k|    }
  662|   130k|  } else if (is_vert_4tap) {
  ------------------
  |  Branch (662:14): [True: 31.4k, False: 99.1k]
  ------------------
  663|  31.4k|    int im_h = h + 3;
  664|  31.4k|    const int fo_vert = 1;
  665|  31.4k|    const int fo_horiz = filter_params_x->taps / 2 - 1;
  666|  31.4k|    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  667|       |
  668|  31.4k|    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  669|  31.4k|    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
  670|       |
  671|  76.7k|    for (int j = 0; j < w; j += 8) {
  ------------------
  |  Branch (671:21): [True: 45.2k, False: 31.4k]
  ------------------
  672|       |      /* Horizontal filter */
  673|  45.2k|      const uint8_t *src_h = src_ptr + j;
  674|  45.2k|      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
  ------------------
  |  |  483|  45.2k|  do {                                                                  \
  |  |  484|   280k|    for (i = 0; i < im_h; i += 2) {                                     \
  |  |  ------------------
  |  |  |  Branch (484:17): [True: 235k, False: 45.2k]
  |  |  ------------------
  |  |  485|   235k|      __m256i data =                                                    \
  |  |  486|   235k|          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));    \
  |  |  487|   235k|      if (i + 1 < im_h)                                                 \
  |  |  ------------------
  |  |  |  Branch (487:11): [True: 189k, False: 45.2k]
  |  |  ------------------
  |  |  488|   235k|        data = _mm256_inserti128_si256(                                 \
  |  |  489|   235k|            data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
  |  |  490|   235k|      src_h += (src_stride << 1);                                       \
  |  |  491|   235k|      __m256i res = convolve_lowbd_x(data, coeffs_x, filt);             \
  |  |  492|   235k|                                                                        \
  |  |  493|   235k|      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),      \
  |  |  494|   235k|                             round_shift_h);                            \
  |  |  495|   235k|                                                                        \
  |  |  496|   235k|      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
  |  |  497|   235k|    }                                                                   \
  |  |  498|  45.2k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (498:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  675|       |
  676|       |      /* Vertical filter */
  677|  45.2k|      __m256i s[6];
  678|  45.2k|      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
  679|  45.2k|      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
  680|  45.2k|      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
  681|  45.2k|      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
  682|       |
  683|  45.2k|      s[0] = _mm256_unpacklo_epi16(s0, s1);
  684|  45.2k|      s[1] = _mm256_unpacklo_epi16(s2, s3);
  685|       |
  686|  45.2k|      s[3] = _mm256_unpackhi_epi16(s0, s1);
  687|  45.2k|      s[4] = _mm256_unpackhi_epi16(s2, s3);
  688|       |
  689|   189k|      for (i = 0; i < h; i += 2) {
  ------------------
  |  Branch (689:19): [True: 144k, False: 45.2k]
  ------------------
  690|   144k|        const int16_t *data = &im_block[i * im_stride];
  691|       |
  692|   144k|        const __m256i s4 =
  693|   144k|            _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
  694|   144k|        const __m256i s5 =
  695|   144k|            _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
  696|       |
  697|   144k|        s[2] = _mm256_unpacklo_epi16(s4, s5);
  698|   144k|        s[5] = _mm256_unpackhi_epi16(s4, s5);
  699|       |
  700|   144k|        const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
  701|   144k|        const __m256i res_a_round = _mm256_sra_epi32(
  702|   144k|            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
  703|       |
  704|   144k|        if (w - j > 4) {
  ------------------
  |  Branch (704:13): [True: 144k, False: 1]
  ------------------
  705|   144k|          const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
  706|   144k|          const __m256i res_b_round = _mm256_sra_epi32(
  707|   144k|              _mm256_add_epi32(res_b, round_const_v), round_shift_v);
  708|   144k|          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
  709|   144k|          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
  710|       |
  711|   144k|          if (do_average) {
  ------------------
  |  Branch (711:15): [True: 59.6k, False: 84.7k]
  ------------------
  712|  59.6k|            const __m256i data_ref_0 =
  713|  59.6k|                load_line2_avx2(&dst[i * dst_stride + j],
  714|  59.6k|                                &dst[i * dst_stride + j + dst_stride]);
  715|  59.6k|            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
  716|  59.6k|                                                  &wt, use_dist_wtd_comp_avg);
  717|       |
  718|  59.6k|            const __m256i round_result = convolve_rounding(
  719|  59.6k|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  720|       |
  721|  59.6k|            const __m256i res_8 =
  722|  59.6k|                _mm256_packus_epi16(round_result, round_result);
  723|  59.6k|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  724|  59.6k|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  725|       |
  726|  59.6k|            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
  727|  59.6k|            _mm_storel_epi64(
  728|  59.6k|                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
  729|  84.7k|          } else {
  730|  84.7k|            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
  731|  84.7k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  732|       |
  733|  84.7k|            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
  734|  84.7k|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  735|  84.7k|                            res_1);
  736|  84.7k|          }
  737|   144k|        } else {
  738|      1|          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
  739|      1|          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
  740|       |
  741|      1|          if (do_average) {
  ------------------
  |  Branch (741:15): [True: 0, False: 1]
  ------------------
  742|      0|            const __m256i data_ref_0 =
  743|      0|                load_line2_avx2(&dst[i * dst_stride + j],
  744|      0|                                &dst[i * dst_stride + j + dst_stride]);
  745|       |
  746|      0|            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
  747|      0|                                                  &wt, use_dist_wtd_comp_avg);
  748|       |
  749|      0|            const __m256i round_result = convolve_rounding(
  750|      0|                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
  751|       |
  752|      0|            const __m256i res_8 =
  753|      0|                _mm256_packus_epi16(round_result, round_result);
  754|      0|            const __m128i res_0 = _mm256_castsi256_si128(res_8);
  755|      0|            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
  756|       |
  757|      0|            *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
  758|      0|            *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
  759|      0|                _mm_cvtsi128_si32(res_1);
  760|       |
  761|      1|          } else {
  762|      1|            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
  763|      1|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
  764|       |
  765|      1|            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
  766|      1|            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
  767|      1|                            res_1);
  768|      1|          }
  769|      1|        }
  770|   144k|        s[0] = s[1];
  771|   144k|        s[1] = s[2];
  772|   144k|        s[3] = s[4];
  773|   144k|        s[4] = s[5];
  774|   144k|      }
  775|  45.2k|    }
  776|  99.1k|  } else {
  777|  99.1k|    int im_h = h + filter_params_y->taps - 1;
  778|  99.1k|    const int fo_vert = filter_params_y->taps / 2 - 1;
  779|  99.1k|    const int fo_horiz = filter_params_x->taps / 2 - 1;
  780|  99.1k|    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  781|       |
  782|  99.1k|    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  783|  99.1k|    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
  784|       |
  785|   335k|    for (int j = 0; j < w; j += 8) {
  ------------------
  |  Branch (785:21): [True: 236k, False: 99.1k]
  ------------------
  786|       |      /* Horizontal filter */
  787|   236k|      const uint8_t *src_h = src_ptr + j;
  788|   236k|      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
  ------------------
  |  |  483|   236k|  do {                                                                  \
  |  |  484|  4.21M|    for (i = 0; i < im_h; i += 2) {                                     \
  |  |  ------------------
  |  |  |  Branch (484:17): [True: 3.98M, False: 236k]
  |  |  ------------------
  |  |  485|  3.98M|      __m256i data =                                                    \
  |  |  486|  3.98M|          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));    \
  |  |  487|  3.98M|      if (i + 1 < im_h)                                                 \
  |  |  ------------------
  |  |  |  Branch (487:11): [True: 3.73M, False: 243k]
  |  |  ------------------
  |  |  488|  3.98M|        data = _mm256_inserti128_si256(                                 \
  |  |  489|  3.98M|            data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
  |  |  490|  3.98M|      src_h += (src_stride << 1);                                       \
  |  |  491|  3.98M|      __m256i res = convolve_lowbd_x(data, coeffs_x, filt);             \
  |  |  492|  3.98M|                                                                        \
  |  |  493|  3.98M|      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),      \
  |  |  494|  3.98M|                             round_shift_h);                            \
  |  |  495|  3.98M|                                                                        \
  |  |  496|  3.98M|      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
  |  |  497|  3.98M|    }                                                                   \
  |  |  498|   236k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (498:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  789|       |
  790|   236k|      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
  ------------------
  |  |  501|   236k|  do {                                                                         \
  |  |  502|   236k|    __m256i s[8];                                                              \
  |  |  503|   236k|    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));    \
  |  |  504|   236k|    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));    \
  |  |  505|   236k|    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));    \
  |  |  506|   236k|    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));    \
  |  |  507|   236k|    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));    \
  |  |  508|   236k|    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));    \
  |  |  509|   236k|                                                                               \
  |  |  510|   236k|    s[0] = _mm256_unpacklo_epi16(s0, s1);                                      \
  |  |  511|   236k|    s[1] = _mm256_unpacklo_epi16(s2, s3);                                      \
  |  |  512|   236k|    s[2] = _mm256_unpacklo_epi16(s4, s5);                                      \
  |  |  513|   236k|                                                                               \
  |  |  514|   236k|    s[4] = _mm256_unpackhi_epi16(s0, s1);                                      \
  |  |  515|   236k|    s[5] = _mm256_unpackhi_epi16(s2, s3);                                      \
  |  |  516|   236k|    s[6] = _mm256_unpackhi_epi16(s4, s5);                                      \
  |  |  517|   236k|                                                                               \
  |  |  518|  2.91M|    for (i = 0; i < h; i += 2) {                                               \
  |  |  ------------------
  |  |  |  Branch (518:17): [True: 2.68M, False: 236k]
  |  |  ------------------
  |  |  519|  2.68M|      const int16_t *data = &im_block[i * im_stride];                          \
  |  |  520|  2.68M|                                                                               \
  |  |  521|  2.68M|      const __m256i s6 =                                                       \
  |  |  522|  2.68M|          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));               \
  |  |  523|  2.68M|      const __m256i s7 =                                                       \
  |  |  524|  2.68M|          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));               \
  |  |  525|  2.68M|                                                                               \
  |  |  526|  2.68M|      s[3] = _mm256_unpacklo_epi16(s6, s7);                                    \
  |  |  527|  2.68M|      s[7] = _mm256_unpackhi_epi16(s6, s7);                                    \
  |  |  528|  2.68M|                                                                               \
  |  |  529|  2.68M|      const __m256i res_a = convolve(s, coeffs_y);                             \
  |  |  530|  2.68M|      const __m256i res_a_round = _mm256_sra_epi32(                            \
  |  |  531|  2.68M|          _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
  |  |  532|  2.68M|                                                                               \
  |  |  533|  2.68M|      if (w - j > 4) {                                                         \
  |  |  ------------------
  |  |  |  Branch (533:11): [True: 2.68M, False: 18.4E]
  |  |  ------------------
  |  |  534|  2.68M|        const __m256i res_b = convolve(s + 4, coeffs_y);                       \
  |  |  535|  2.68M|        const __m256i res_b_round = _mm256_sra_epi32(                          \
  |  |  536|  2.68M|            _mm256_add_epi32(res_b, round_const_v), round_shift_v);            \
  |  |  537|  2.68M|        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);  \
  |  |  538|  2.68M|        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
  |  |  539|  2.68M|                                                                               \
  |  |  540|  2.68M|        if (do_average) {                                                      \
  |  |  ------------------
  |  |  |  Branch (540:13): [True: 1.14M, False: 1.54M]
  |  |  ------------------
  |  |  541|  1.14M|          const __m256i data_ref_0 =                                           \
  |  |  542|  1.14M|              load_line2_avx2(&dst[i * dst_stride + j],                        \
  |  |  543|  1.14M|                              &dst[i * dst_stride + j + dst_stride]);          \
  |  |  544|  1.14M|          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
  |  |  545|  1.14M|                                                &wt, use_dist_wtd_comp_avg);   \
  |  |  546|  1.14M|                                                                               \
  |  |  547|  1.14M|          const __m256i round_result = convolve_rounding(                      \
  |  |  548|  1.14M|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
  |  |  549|  1.14M|                                                                               \
  |  |  550|  1.14M|          const __m256i res_8 =                                                \
  |  |  551|  1.14M|              _mm256_packus_epi16(round_result, round_result);                 \
  |  |  552|  1.14M|          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
  |  |  553|  1.14M|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
  |  |  554|  1.14M|                                                                               \
  |  |  555|  1.14M|          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);    \
  |  |  556|  1.14M|          _mm_storel_epi64(                                                    \
  |  |  557|  1.14M|              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
  |  |  558|  1.54M|        } else {                                                               \
  |  |  559|  1.54M|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
  |  |  560|  1.54M|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
  |  |  561|  1.54M|                                                                               \
  |  |  562|  1.54M|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
  |  |  563|  1.54M|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
  |  |  564|  1.54M|                          res_1);                                              \
  |  |  565|  1.54M|        }                                                                      \
  |  |  566|  18.4E|      } else {                                                                 \
  |  |  567|  18.4E|        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);  \
  |  |  568|  18.4E|        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
  |  |  569|  18.4E|                                                                               \
  |  |  570|  18.4E|        if (do_average) {                                                      \
  |  |  ------------------
  |  |  |  Branch (570:13): [True: 0, False: 18.4E]
  |  |  ------------------
  |  |  571|      0|          const __m256i data_ref_0 =                                           \
  |  |  572|      0|              load_line2_avx2(&dst[i * dst_stride + j],                        \
  |  |  573|      0|                              &dst[i * dst_stride + j + dst_stride]);          \
  |  |  574|      0|                                                                               \
  |  |  575|      0|          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
  |  |  576|      0|                                                &wt, use_dist_wtd_comp_avg);   \
  |  |  577|      0|                                                                               \
  |  |  578|      0|          const __m256i round_result = convolve_rounding(                      \
  |  |  579|      0|              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
  |  |  580|      0|                                                                               \
  |  |  581|      0|          const __m256i res_8 =                                                \
  |  |  582|      0|              _mm256_packus_epi16(round_result, round_result);                 \
  |  |  583|      0|          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
  |  |  584|      0|          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
  |  |  585|      0|                                                                               \
  |  |  586|      0|          *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);     \
  |  |  587|      0|          *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =                 \
  |  |  588|      0|              _mm_cvtsi128_si32(res_1);                                        \
  |  |  589|      0|                                                                               \
  |  |  590|  18.4E|        } else {                                                               \
  |  |  591|  18.4E|          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
  |  |  592|  18.4E|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
  |  |  593|  18.4E|                                                                               \
  |  |  594|  18.4E|          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
  |  |  595|  18.4E|          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
  |  |  596|  18.4E|                          res_1);                                              \
  |  |  597|  18.4E|        }                                                                      \
  |  |  598|  18.4E|      }                                                                        \
  |  |  599|  2.68M|                                                                               \
  |  |  600|  2.68M|      s[0] = s[1];                                                             \
  |  |  601|  2.68M|      s[1] = s[2];                                                             \
  |  |  602|  2.68M|      s[2] = s[3];                                                             \
  |  |  603|  2.68M|                                                                               \
  |  |  604|  2.68M|      s[4] = s[5];                                                             \
  |  |  605|  2.68M|      s[5] = s[6];                                                             \
  |  |  606|  2.68M|      s[6] = s[7];                                                             \
  |  |  607|  2.68M|    }                                                                          \
  |  |  608|   236k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (608:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  791|   236k|    }
  792|  99.1k|  }
  793|   238k|}
av1_dist_wtd_convolve_2d_copy_avx2:
 1091|   975k|                                        int h, ConvolveParams *conv_params) {
 1092|   975k|  const int bd = 8;
 1093|   975k|  CONV_BUF_TYPE *dst = conv_params->dst;
 1094|   975k|  int dst_stride = conv_params->dst_stride;
 1095|   975k|  assert(conv_params->round_0 == 3);
 1096|   975k|  assert(conv_params->round_1 == 7);
 1097|   975k|  assert(w % 4 == 0);
 1098|   975k|  assert(h % 4 == 0);
 1099|       |
 1100|   975k|  const int do_average = conv_params->do_average;
 1101|   975k|  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
 1102|   975k|  const __m256i wt = unpack_weights_avx2(conv_params);
 1103|   975k|  const __m256i zero = _mm256_setzero_si256();
 1104|       |
 1105|   975k|  const int offset_0 =
 1106|   975k|      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   975k|#define FILTER_BITS 7
  ------------------
 1107|   975k|  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
 1108|   975k|  const __m256i offset_const = _mm256_set1_epi16(offset);
 1109|   975k|  const int rounding_shift =
 1110|   975k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   975k|#define FILTER_BITS 7
  ------------------
 1111|   975k|  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
 1112|       |
 1113|   975k|  if (do_average) {
  ------------------
  |  Branch (1113:7): [True: 441k, False: 534k]
  ------------------
 1114|   441k|    if (use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (1114:9): [True: 74.3k, False: 366k]
  ------------------
 1115|  74.3k|      DO_AVG_2D_COPY(1)
  ------------------
  |  |  957|  74.3k|  int i = h;                                                                  \
  |  |  958|  74.3k|  if (w >= 16) {                                                              \
  |  |  ------------------
  |  |  |  Branch (958:7): [True: 26.4k, False: 47.8k]
  |  |  ------------------
  |  |  959|  26.4k|    __m256i src_0, src_1, src_2, src_3;                                       \
  |  |  960|  26.4k|    __m256i ref_0, ref_1, ref_2, ref_3;                                       \
  |  |  961|  26.4k|    __m256i res_0, res_1, res_2, res_3;                                       \
  |  |  962|  26.4k|    __m256i res_10, res_32;                                                   \
  |  |  963|  26.4k|    if (w == 128) {                                                           \
  |  |  ------------------
  |  |  |  Branch (963:9): [True: 242, False: 26.2k]
  |  |  ------------------
  |  |  964|  29.3k|      do {                                                                    \
  |  |  965|  29.3k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48);    \
  |  |  ------------------
  |  |  |  |  903|  29.3k|  do {                                                                         \
  |  |  |  |  904|  29.3k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  29.3k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  29.3k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  29.3k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  29.3k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  29.3k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  29.3k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  29.3k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  29.3k|                                                                               \
  |  |  |  |  913|  29.3k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  29.3k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  29.3k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  29.3k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  29.3k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  29.3k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  29.3k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  29.3k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  29.3k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  29.3k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  29.3k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  29.3k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  29.3k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  29.3k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  29.3k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  29.3k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  29.3k|                                                                               \
  |  |  |  |  922|  29.3k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  29.3k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  29.3k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  29.3k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  29.3k|                                                                               \
  |  |  |  |  927|  29.3k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  29.3k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  29.3k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  29.3k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  29.3k|                                                                               \
  |  |  |  |  932|  29.3k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  29.3k|                              rounding_shift);                                 \
  |  |  |  |  934|  29.3k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  29.3k|                              rounding_shift);                                 \
  |  |  |  |  936|  29.3k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  29.3k|                              rounding_shift);                                 \
  |  |  |  |  938|  29.3k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  29.3k|                              rounding_shift);                                 \
  |  |  |  |  940|  29.3k|                                                                               \
  |  |  |  |  941|  29.3k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  29.3k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  29.3k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  29.3k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  29.3k|                                                                               \
  |  |  |  |  946|  29.3k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  29.3k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  29.3k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  29.3k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  29.3k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  29.3k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  29.3k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  29.3k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  29.3k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  966|  29.3k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112);  \
  |  |  ------------------
  |  |  |  |  903|  29.3k|  do {                                                                         \
  |  |  |  |  904|  29.3k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  29.3k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  29.3k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  29.3k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  29.3k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  29.3k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  29.3k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  29.3k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  29.3k|                                                                               \
  |  |  |  |  913|  29.3k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  29.3k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  29.3k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  29.3k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  29.3k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  29.3k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  29.3k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  29.3k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  29.3k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  29.3k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  29.3k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  29.3k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  29.3k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  29.3k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  29.3k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  29.3k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  29.3k|                                                                               \
  |  |  |  |  922|  29.3k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  29.3k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  29.3k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  29.3k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  29.3k|                                                                               \
  |  |  |  |  927|  29.3k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  29.3k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  29.3k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  29.3k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  29.3k|                                                                               \
  |  |  |  |  932|  29.3k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  29.3k|                              rounding_shift);                                 \
  |  |  |  |  934|  29.3k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  29.3k|                              rounding_shift);                                 \
  |  |  |  |  936|  29.3k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  29.3k|                              rounding_shift);                                 \
  |  |  |  |  938|  29.3k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  29.3k|                              rounding_shift);                                 \
  |  |  |  |  940|  29.3k|                                                                               \
  |  |  |  |  941|  29.3k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  29.3k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  29.3k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  29.3k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  29.3k|                                                                               \
  |  |  |  |  946|  29.3k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  29.3k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  29.3k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  29.3k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  29.3k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  29.3k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  29.3k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  29.3k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  29.3k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  967|  29.3k|        i -= 1;                                                               \
  |  |  968|  29.3k|        src += 1 * src_stride;                                                \
  |  |  969|  29.3k|        dst += 1 * dst_stride;                                                \
  |  |  970|  29.3k|        dst0 += 1 * dst_stride0;                                              \
  |  |  971|  29.3k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (971:16): [True: 29.1k, False: 242]
  |  |  ------------------
  |  |  972|  26.2k|    } else if (w == 64) {                                                     \
  |  |  ------------------
  |  |  |  Branch (972:16): [True: 1.43k, False: 24.8k]
  |  |  ------------------
  |  |  973|  83.6k|      do {                                                                    \
  |  |  974|  83.6k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48);    \
  |  |  ------------------
  |  |  |  |  903|  83.6k|  do {                                                                         \
  |  |  |  |  904|  83.6k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  83.6k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  83.6k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  83.6k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  83.6k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  83.6k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  83.6k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  83.6k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  83.6k|                                                                               \
  |  |  |  |  913|  83.6k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  83.6k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  83.6k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  83.6k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  83.6k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  83.6k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  83.6k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  83.6k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  83.6k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  83.6k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  83.6k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  83.6k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  83.6k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  83.6k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  83.6k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  83.6k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  83.6k|                                                                               \
  |  |  |  |  922|  83.6k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  83.6k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  83.6k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  83.6k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  83.6k|                                                                               \
  |  |  |  |  927|  83.6k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  83.6k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  83.6k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  83.6k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  83.6k|                                                                               \
  |  |  |  |  932|  83.6k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  83.6k|                              rounding_shift);                                 \
  |  |  |  |  934|  83.6k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  83.6k|                              rounding_shift);                                 \
  |  |  |  |  936|  83.6k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  83.6k|                              rounding_shift);                                 \
  |  |  |  |  938|  83.6k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  83.6k|                              rounding_shift);                                 \
  |  |  |  |  940|  83.6k|                                                                               \
  |  |  |  |  941|  83.6k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  83.6k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  83.6k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  83.6k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  83.6k|                                                                               \
  |  |  |  |  946|  83.6k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  83.6k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  83.6k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  83.6k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  83.6k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  83.6k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  83.6k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  83.6k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  83.6k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  975|  83.6k|                                                                              \
  |  |  976|  83.6k|        i -= 1;                                                               \
  |  |  977|  83.6k|        src += 1 * src_stride;                                                \
  |  |  978|  83.6k|        dst += 1 * dst_stride;                                                \
  |  |  979|  83.6k|        dst0 += 1 * dst_stride0;                                              \
  |  |  980|  83.6k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (980:16): [True: 82.2k, False: 1.43k]
  |  |  ------------------
  |  |  981|  24.8k|    } else if (w == 32) {                                                     \
  |  |  ------------------
  |  |  |  Branch (981:16): [True: 6.27k, False: 18.5k]
  |  |  ------------------
  |  |  982|  72.6k|      do {                                                                    \
  |  |  983|  72.6k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16);     \
  |  |  ------------------
  |  |  |  |  903|  72.6k|  do {                                                                         \
  |  |  |  |  904|  72.6k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  72.6k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  72.6k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  72.6k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  72.6k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  72.6k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  72.6k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  72.6k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  72.6k|                                                                               \
  |  |  |  |  913|  72.6k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  72.6k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  72.6k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  72.6k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  72.6k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  72.6k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  72.6k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  72.6k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  72.6k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  72.6k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  72.6k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  72.6k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  72.6k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  72.6k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  72.6k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  72.6k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  72.6k|                                                                               \
  |  |  |  |  922|  72.6k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  72.6k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  72.6k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  72.6k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  72.6k|                                                                               \
  |  |  |  |  927|  72.6k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  72.6k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  72.6k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  72.6k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  72.6k|                                                                               \
  |  |  |  |  932|  72.6k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  72.6k|                              rounding_shift);                                 \
  |  |  |  |  934|  72.6k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  72.6k|                              rounding_shift);                                 \
  |  |  |  |  936|  72.6k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  72.6k|                              rounding_shift);                                 \
  |  |  |  |  938|  72.6k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  72.6k|                              rounding_shift);                                 \
  |  |  |  |  940|  72.6k|                                                                               \
  |  |  |  |  941|  72.6k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  72.6k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  72.6k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  72.6k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  72.6k|                                                                               \
  |  |  |  |  946|  72.6k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  72.6k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  72.6k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  72.6k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  72.6k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  72.6k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  72.6k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  72.6k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  72.6k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  984|  72.6k|                                                                              \
  |  |  985|  72.6k|        i -= 2;                                                               \
  |  |  986|  72.6k|        src += 2 * src_stride;                                                \
  |  |  987|  72.6k|        dst += 2 * dst_stride;                                                \
  |  |  988|  72.6k|        dst0 += 2 * dst_stride0;                                              \
  |  |  989|  72.6k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (989:16): [True: 66.3k, False: 6.27k]
  |  |  ------------------
  |  |  990|  18.5k|    } else {                                                                  \
  |  |  991|  18.5k|      assert(w == 16);                                                        \
  |  |  992|  65.5k|      do {                                                                    \
  |  |  993|  65.5k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0);       \
  |  |  ------------------
  |  |  |  |  903|  65.5k|  do {                                                                         \
  |  |  |  |  904|  65.5k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  65.5k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  65.5k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  65.5k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  65.5k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  65.5k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  65.5k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  65.5k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  65.5k|                                                                               \
  |  |  |  |  913|  65.5k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  65.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  65.5k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  65.5k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  65.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  65.5k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  65.5k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  65.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  65.5k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  65.5k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  65.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  65.5k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  65.5k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  65.5k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  65.5k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  65.5k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  65.5k|                                                                               \
  |  |  |  |  922|  65.5k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  65.5k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  65.5k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  65.5k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  65.5k|                                                                               \
  |  |  |  |  927|  65.5k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  65.5k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  65.5k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  65.5k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  65.5k|                                                                               \
  |  |  |  |  932|  65.5k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  65.5k|                              rounding_shift);                                 \
  |  |  |  |  934|  65.5k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  65.5k|                              rounding_shift);                                 \
  |  |  |  |  936|  65.5k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  65.5k|                              rounding_shift);                                 \
  |  |  |  |  938|  65.5k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  65.5k|                              rounding_shift);                                 \
  |  |  |  |  940|  65.5k|                                                                               \
  |  |  |  |  941|  65.5k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  65.5k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  65.5k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  65.5k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  65.5k|                                                                               \
  |  |  |  |  946|  65.5k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  65.5k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  65.5k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  65.5k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  65.5k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  65.5k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  65.5k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  65.5k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  65.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  994|  65.5k|                                                                              \
  |  |  995|  65.5k|        i -= 4;                                                               \
  |  |  996|  65.5k|        src += 4 * src_stride;                                                \
  |  |  997|  65.5k|        dst += 4 * dst_stride;                                                \
  |  |  998|  65.5k|        dst0 += 4 * dst_stride0;                                              \
  |  |  999|  65.5k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (999:16): [True: 47.0k, False: 18.5k]
  |  |  ------------------
  |  | 1000|  18.5k|    }                                                                         \
  |  | 1001|  47.8k|  } else if (w == 8) {                                                        \
  |  |  ------------------
  |  |  |  Branch (1001:14): [True: 28.8k, False: 18.9k]
  |  |  ------------------
  |  | 1002|  70.5k|    do {                                                                      \
  |  | 1003|  70.5k|      const __m128i src_0 =                                                   \
  |  | 1004|  70.5k|          _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));                 \
  |  | 1005|  70.5k|      const __m128i src_1 =                                                   \
  |  | 1006|  70.5k|          _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));                 \
  |  | 1007|  70.5k|      const __m128i src_2 =                                                   \
  |  | 1008|  70.5k|          _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));                 \
  |  | 1009|  70.5k|      const __m128i src_3 =                                                   \
  |  | 1010|  70.5k|          _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));                 \
  |  | 1011|  70.5k|      __m256i src_10 =                                                        \
  |  | 1012|  70.5k|          _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1);   \
  |  | 1013|  70.5k|      __m256i src_32 =                                                        \
  |  | 1014|  70.5k|          _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1);   \
  |  | 1015|  70.5k|                                                                              \
  |  | 1016|  70.5k|      src_10 = _mm256_unpacklo_epi8(src_10, zero);                            \
  |  | 1017|  70.5k|      src_32 = _mm256_unpacklo_epi8(src_32, zero);                            \
  |  | 1018|  70.5k|                                                                              \
  |  | 1019|  70.5k|      src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);                         \
  |  |  ------------------
  |  |  |  |  822|  70.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  70.5k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1020|  70.5k|      src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);                         \
  |  |  ------------------
  |  |  |  |  822|  70.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  70.5k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1021|  70.5k|                                                                              \
  |  | 1022|  70.5k|      src_10 = _mm256_add_epi16(src_10, offset_const);                        \
  |  | 1023|  70.5k|      src_32 = _mm256_add_epi16(src_32, offset_const);                        \
  |  | 1024|  70.5k|                                                                              \
  |  | 1025|  70.5k|      const __m256i ref_10 =                                                  \
  |  | 1026|  70.5k|          load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]);        \
  |  | 1027|  70.5k|      const __m256i ref_32 =                                                  \
  |  | 1028|  70.5k|          load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]);        \
  |  | 1029|  70.5k|      __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED);    \
  |  | 1030|  70.5k|      __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED);    \
  |  | 1031|  70.5k|                                                                              \
  |  | 1032|  70.5k|      res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const,     \
  |  | 1033|  70.5k|                                 rounding_shift);                             \
  |  | 1034|  70.5k|      res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const,     \
  |  | 1035|  70.5k|                                 rounding_shift);                             \
  |  | 1036|  70.5k|                                                                              \
  |  | 1037|  70.5k|      __m256i res = _mm256_packus_epi16(res_10, res_32);                      \
  |  | 1038|  70.5k|      const __m128i res_20 = _mm256_castsi256_si128(res);                     \
  |  | 1039|  70.5k|      const __m128i res_31 = _mm256_extracti128_si256(res, 1);                \
  |  | 1040|  70.5k|                                                                              \
  |  | 1041|  70.5k|      _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20);          \
  |  | 1042|  70.5k|      _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31);        \
  |  | 1043|  70.5k|      _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20);          \
  |  | 1044|  70.5k|      _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31);        \
  |  | 1045|  70.5k|      i -= 4;                                                                 \
  |  | 1046|  70.5k|      src += 4 * src_stride;                                                  \
  |  | 1047|  70.5k|      dst += 4 * dst_stride;                                                  \
  |  | 1048|  70.5k|      dst0 += 4 * dst_stride0;                                                \
  |  | 1049|  70.5k|    } while (i);                                                              \
  |  |  ------------------
  |  |  |  Branch (1049:14): [True: 41.7k, False: 28.8k]
  |  |  ------------------
  |  | 1050|  28.8k|  } else {                                                                    \
  |  | 1051|  18.9k|    assert(w == 4);                                                           \
  |  | 1052|  27.9k|    do {                                                                      \
  |  | 1053|  27.9k|      __m256i src_3210_8bit =                                                 \
  |  | 1054|  27.9k|          _mm256_setr_epi32(loadu_int32(src + 0 * src_stride),                \
  |  | 1055|  27.9k|                            loadu_int32(src + 1 * src_stride), 0, 0,          \
  |  | 1056|  27.9k|                            loadu_int32(src + 2 * src_stride),                \
  |  | 1057|  27.9k|                            loadu_int32(src + 3 * src_stride), 0, 0);         \
  |  | 1058|  27.9k|                                                                              \
  |  | 1059|  27.9k|      __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero);           \
  |  | 1060|  27.9k|      src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT);                     \
  |  |  ------------------
  |  |  |  |  822|  27.9k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  27.9k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1061|  27.9k|      src_3210 = _mm256_add_epi16(src_3210, offset_const);                    \
  |  | 1062|  27.9k|                                                                              \
  |  | 1063|  27.9k|      __m256i ref_3210 =                                                      \
  |  | 1064|  27.9k|          _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride),              \
  |  | 1065|  27.9k|                             *(int64_t *)(dst + 1 * dst_stride),              \
  |  | 1066|  27.9k|                             *(int64_t *)(dst + 2 * dst_stride),              \
  |  | 1067|  27.9k|                             *(int64_t *)(dst + 3 * dst_stride));             \
  |  | 1068|  27.9k|      __m256i res_3210 =                                                      \
  |  | 1069|  27.9k|          comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED);             \
  |  | 1070|  27.9k|                                                                              \
  |  | 1071|  27.9k|      res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \
  |  | 1072|  27.9k|                                   rounding_shift);                           \
  |  | 1073|  27.9k|                                                                              \
  |  | 1074|  27.9k|      res_3210 = _mm256_packus_epi16(res_3210, res_3210);                     \
  |  | 1075|  27.9k|      const __m128i res_10 = _mm256_castsi256_si128(res_3210);                \
  |  | 1076|  27.9k|      const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1);           \
  |  | 1077|  27.9k|                                                                              \
  |  | 1078|  27.9k|      *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10);           \
  |  | 1079|  27.9k|      *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32);           \
  |  | 1080|  27.9k|      *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1);        \
  |  | 1081|  27.9k|      *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1);        \
  |  | 1082|  27.9k|      i -= 4;                                                                 \
  |  | 1083|  27.9k|      src += 4 * src_stride;                                                  \
  |  | 1084|  27.9k|      dst += 4 * dst_stride;                                                  \
  |  | 1085|  27.9k|      dst0 += 4 * dst_stride0;                                                \
  |  | 1086|  27.9k|    } while (i);                                                              \
  |  |  ------------------
  |  |  |  Branch (1086:14): [True: 8.99k, False: 18.9k]
  |  |  ------------------
  |  | 1087|  18.9k|  }
  ------------------
 1116|   366k|    } else {
 1117|   366k|      DO_AVG_2D_COPY(0)
  ------------------
  |  |  957|   366k|  int i = h;                                                                  \
  |  |  958|   366k|  if (w >= 16) {                                                              \
  |  |  ------------------
  |  |  |  Branch (958:7): [True: 325k, False: 41.4k]
  |  |  ------------------
  |  |  959|   325k|    __m256i src_0, src_1, src_2, src_3;                                       \
  |  |  960|   325k|    __m256i ref_0, ref_1, ref_2, ref_3;                                       \
  |  |  961|   325k|    __m256i res_0, res_1, res_2, res_3;                                       \
  |  |  962|   325k|    __m256i res_10, res_32;                                                   \
  |  |  963|   325k|    if (w == 128) {                                                           \
  |  |  ------------------
  |  |  |  Branch (963:9): [True: 65.2k, False: 260k]
  |  |  ------------------
  |  |  964|  8.33M|      do {                                                                    \
  |  |  965|  8.33M|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48);    \
  |  |  ------------------
  |  |  |  |  903|  8.33M|  do {                                                                         \
  |  |  |  |  904|  8.33M|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  8.33M|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  8.33M|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  8.33M|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  8.33M|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  8.33M|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  8.33M|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  8.33M|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  8.33M|                                                                               \
  |  |  |  |  913|  8.33M|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  8.33M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  8.33M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  8.33M|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  8.33M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  8.33M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  8.33M|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  8.33M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  8.33M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  8.33M|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  8.33M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  8.33M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  8.33M|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  8.33M|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  8.33M|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  8.33M|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  8.33M|                                                                               \
  |  |  |  |  922|  8.33M|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  8.33M|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  8.33M|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  8.33M|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  8.33M|                                                                               \
  |  |  |  |  927|  8.33M|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  8.33M|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  8.33M|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  8.33M|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  8.33M|                                                                               \
  |  |  |  |  932|  8.33M|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  8.33M|                              rounding_shift);                                 \
  |  |  |  |  934|  8.33M|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  8.33M|                              rounding_shift);                                 \
  |  |  |  |  936|  8.33M|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  8.33M|                              rounding_shift);                                 \
  |  |  |  |  938|  8.33M|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  8.33M|                              rounding_shift);                                 \
  |  |  |  |  940|  8.33M|                                                                               \
  |  |  |  |  941|  8.33M|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  8.33M|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  8.33M|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  8.33M|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  8.33M|                                                                               \
  |  |  |  |  946|  8.33M|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  8.33M|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  8.33M|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  8.33M|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  8.33M|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  8.33M|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  8.33M|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  8.33M|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  8.33M|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  966|  8.33M|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112);  \
  |  |  ------------------
  |  |  |  |  903|  8.33M|  do {                                                                         \
  |  |  |  |  904|  8.33M|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  8.33M|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  8.33M|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  8.33M|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  8.33M|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  8.33M|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  8.33M|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  8.33M|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  8.33M|                                                                               \
  |  |  |  |  913|  8.33M|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  8.33M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  8.33M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  8.33M|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  8.33M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  8.33M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  8.33M|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  8.33M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  8.33M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  8.33M|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  8.33M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  8.33M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  8.33M|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  8.33M|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  8.33M|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  8.33M|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  8.33M|                                                                               \
  |  |  |  |  922|  8.33M|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  8.33M|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  8.33M|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  8.33M|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  8.33M|                                                                               \
  |  |  |  |  927|  8.33M|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  8.33M|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  8.33M|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  8.33M|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  8.33M|                                                                               \
  |  |  |  |  932|  8.33M|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  8.33M|                              rounding_shift);                                 \
  |  |  |  |  934|  8.33M|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  8.33M|                              rounding_shift);                                 \
  |  |  |  |  936|  8.33M|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  8.33M|                              rounding_shift);                                 \
  |  |  |  |  938|  8.33M|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  8.33M|                              rounding_shift);                                 \
  |  |  |  |  940|  8.33M|                                                                               \
  |  |  |  |  941|  8.33M|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  8.33M|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  8.33M|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  8.33M|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  8.33M|                                                                               \
  |  |  |  |  946|  8.33M|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  8.33M|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  8.33M|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  8.33M|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  8.33M|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  8.33M|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  8.33M|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  8.33M|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  8.33M|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  967|  8.33M|        i -= 1;                                                               \
  |  |  968|  8.33M|        src += 1 * src_stride;                                                \
  |  |  969|  8.33M|        dst += 1 * dst_stride;                                                \
  |  |  970|  8.33M|        dst0 += 1 * dst_stride0;                                              \
  |  |  971|  8.33M|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (971:16): [True: 8.26M, False: 65.2k]
  |  |  ------------------
  |  |  972|   260k|    } else if (w == 64) {                                                     \
  |  |  ------------------
  |  |  |  Branch (972:16): [True: 165k, False: 94.3k]
  |  |  ------------------
  |  |  973|  12.6M|      do {                                                                    \
  |  |  974|  12.6M|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48);    \
  |  |  ------------------
  |  |  |  |  903|  12.6M|  do {                                                                         \
  |  |  |  |  904|  12.6M|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  12.6M|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  12.6M|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  12.6M|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  12.6M|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  12.6M|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  12.6M|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  12.6M|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  12.6M|                                                                               \
  |  |  |  |  913|  12.6M|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  12.6M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  12.6M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  12.6M|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  12.6M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  12.6M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  12.6M|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  12.6M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  12.6M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  12.6M|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  12.6M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  12.6M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  12.6M|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  12.6M|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  12.6M|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  12.6M|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  12.6M|                                                                               \
  |  |  |  |  922|  12.6M|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  12.6M|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  12.6M|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  12.6M|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  12.6M|                                                                               \
  |  |  |  |  927|  12.6M|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  12.6M|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  12.6M|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  12.6M|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  12.6M|                                                                               \
  |  |  |  |  932|  12.6M|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  12.6M|                              rounding_shift);                                 \
  |  |  |  |  934|  12.6M|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  12.6M|                              rounding_shift);                                 \
  |  |  |  |  936|  12.6M|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  12.6M|                              rounding_shift);                                 \
  |  |  |  |  938|  12.6M|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  12.6M|                              rounding_shift);                                 \
  |  |  |  |  940|  12.6M|                                                                               \
  |  |  |  |  941|  12.6M|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  12.6M|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  12.6M|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  12.6M|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  12.6M|                                                                               \
  |  |  |  |  946|  12.6M|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  12.6M|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  12.6M|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  12.6M|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  12.6M|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  12.6M|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  12.6M|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  12.6M|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  12.6M|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  975|  12.6M|                                                                              \
  |  |  976|  12.6M|        i -= 1;                                                               \
  |  |  977|  12.6M|        src += 1 * src_stride;                                                \
  |  |  978|  12.6M|        dst += 1 * dst_stride;                                                \
  |  |  979|  12.6M|        dst0 += 1 * dst_stride0;                                              \
  |  |  980|  12.6M|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (980:16): [True: 12.4M, False: 165k]
  |  |  ------------------
  |  |  981|   165k|    } else if (w == 32) {                                                     \
  |  |  ------------------
  |  |  |  Branch (981:16): [True: 75.8k, False: 18.4k]
  |  |  ------------------
  |  |  982|  2.20M|      do {                                                                    \
  |  |  983|  2.20M|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16);     \
  |  |  ------------------
  |  |  |  |  903|  2.20M|  do {                                                                         \
  |  |  |  |  904|  2.20M|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  2.20M|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  2.20M|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  2.20M|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  2.20M|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  2.20M|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  2.20M|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  2.20M|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  2.20M|                                                                               \
  |  |  |  |  913|  2.20M|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  2.20M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  2.20M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  2.20M|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  2.20M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  2.20M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  2.20M|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  2.20M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  2.20M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  2.20M|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  2.20M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  2.20M|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  2.20M|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  2.20M|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  2.20M|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  2.20M|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  2.20M|                                                                               \
  |  |  |  |  922|  2.20M|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  2.20M|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  2.20M|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  2.20M|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  2.20M|                                                                               \
  |  |  |  |  927|  2.20M|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  2.20M|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  2.20M|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  2.20M|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  2.20M|                                                                               \
  |  |  |  |  932|  2.20M|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  2.20M|                              rounding_shift);                                 \
  |  |  |  |  934|  2.20M|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  2.20M|                              rounding_shift);                                 \
  |  |  |  |  936|  2.20M|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  2.20M|                              rounding_shift);                                 \
  |  |  |  |  938|  2.20M|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  2.20M|                              rounding_shift);                                 \
  |  |  |  |  940|  2.20M|                                                                               \
  |  |  |  |  941|  2.20M|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  2.20M|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  2.20M|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  2.20M|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  2.20M|                                                                               \
  |  |  |  |  946|  2.20M|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  2.20M|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  2.20M|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  2.20M|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  2.20M|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  2.20M|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  2.20M|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  2.20M|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  2.20M|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  984|  2.20M|                                                                              \
  |  |  985|  2.20M|        i -= 2;                                                               \
  |  |  986|  2.20M|        src += 2 * src_stride;                                                \
  |  |  987|  2.20M|        dst += 2 * dst_stride;                                                \
  |  |  988|  2.20M|        dst0 += 2 * dst_stride0;                                              \
  |  |  989|  2.20M|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (989:16): [True: 2.13M, False: 75.8k]
  |  |  ------------------
  |  |  990|  75.8k|    } else {                                                                  \
  |  |  991|  18.4k|      assert(w == 16);                                                        \
  |  |  992|  82.1k|      do {                                                                    \
  |  |  993|  82.1k|        DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0);       \
  |  |  ------------------
  |  |  |  |  903|  82.1k|  do {                                                                         \
  |  |  |  |  904|  82.1k|    src_0 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  905|  82.1k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));             \
  |  |  |  |  906|  82.1k|    src_1 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  907|  82.1k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));             \
  |  |  |  |  908|  82.1k|    src_2 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  909|  82.1k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));             \
  |  |  |  |  910|  82.1k|    src_3 = _mm256_cvtepu8_epi16(                                              \
  |  |  |  |  911|  82.1k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));             \
  |  |  |  |  912|  82.1k|                                                                               \
  |  |  |  |  913|  82.1k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  82.1k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  82.1k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  914|  82.1k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  82.1k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  82.1k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  915|  82.1k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  82.1k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  82.1k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  916|  82.1k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                              \
  |  |  |  |  ------------------
  |  |  |  |  |  |  822|  82.1k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   21|  82.1k|#define FILTER_BITS 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |  917|  82.1k|    src_0 = _mm256_add_epi16(src_0, offset_const);                             \
  |  |  |  |  918|  82.1k|    src_1 = _mm256_add_epi16(src_1, offset_const);                             \
  |  |  |  |  919|  82.1k|    src_2 = _mm256_add_epi16(src_2, offset_const);                             \
  |  |  |  |  920|  82.1k|    src_3 = _mm256_add_epi16(src_3, offset_const);                             \
  |  |  |  |  921|  82.1k|                                                                               \
  |  |  |  |  922|  82.1k|    ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0]));       \
  |  |  |  |  923|  82.1k|    ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1]));       \
  |  |  |  |  924|  82.1k|    ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2]));       \
  |  |  |  |  925|  82.1k|    ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3]));       \
  |  |  |  |  926|  82.1k|                                                                               \
  |  |  |  |  927|  82.1k|    res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  928|  82.1k|    res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  929|  82.1k|    res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  930|  82.1k|    res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED);                  \
  |  |  |  |  931|  82.1k|                                                                               \
  |  |  |  |  932|  82.1k|    res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const,          \
  |  |  |  |  933|  82.1k|                              rounding_shift);                                 \
  |  |  |  |  934|  82.1k|    res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const,          \
  |  |  |  |  935|  82.1k|                              rounding_shift);                                 \
  |  |  |  |  936|  82.1k|    res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const,          \
  |  |  |  |  937|  82.1k|                              rounding_shift);                                 \
  |  |  |  |  938|  82.1k|    res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const,          \
  |  |  |  |  939|  82.1k|                              rounding_shift);                                 \
  |  |  |  |  940|  82.1k|                                                                               \
  |  |  |  |  941|  82.1k|    res_10 = _mm256_packus_epi16(res_0, res_1);                                \
  |  |  |  |  942|  82.1k|    res_32 = _mm256_packus_epi16(res_2, res_3);                                \
  |  |  |  |  943|  82.1k|    res_10 = _mm256_permute4x64_epi64(res_10, 0xD8);                           \
  |  |  |  |  944|  82.1k|    res_32 = _mm256_permute4x64_epi64(res_32, 0xD8);                           \
  |  |  |  |  945|  82.1k|                                                                               \
  |  |  |  |  946|  82.1k|    _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]),                 \
  |  |  |  |  947|  82.1k|                    _mm256_castsi256_si128(res_10));                           \
  |  |  |  |  948|  82.1k|    _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]),                 \
  |  |  |  |  949|  82.1k|                    _mm256_extracti128_si256(res_10, 1));                      \
  |  |  |  |  950|  82.1k|    _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]),                 \
  |  |  |  |  951|  82.1k|                    _mm256_castsi256_si128(res_32));                           \
  |  |  |  |  952|  82.1k|    _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]),                 \
  |  |  |  |  953|  82.1k|                    _mm256_extracti128_si256(res_32, 1));                      \
  |  |  |  |  954|  82.1k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (954:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  994|  82.1k|                                                                              \
  |  |  995|  82.1k|        i -= 4;                                                               \
  |  |  996|  82.1k|        src += 4 * src_stride;                                                \
  |  |  997|  82.1k|        dst += 4 * dst_stride;                                                \
  |  |  998|  82.1k|        dst0 += 4 * dst_stride0;                                              \
  |  |  999|  82.1k|      } while (i);                                                            \
  |  |  ------------------
  |  |  |  Branch (999:16): [True: 63.4k, False: 18.6k]
  |  |  ------------------
  |  | 1000|  18.6k|    }                                                                         \
  |  | 1001|   325k|  } else if (w == 8) {                                                        \
  |  |  ------------------
  |  |  |  Branch (1001:14): [True: 25.7k, False: 15.6k]
  |  |  ------------------
  |  | 1002|  72.1k|    do {                                                                      \
  |  | 1003|  72.1k|      const __m128i src_0 =                                                   \
  |  | 1004|  72.1k|          _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));                 \
  |  | 1005|  72.1k|      const __m128i src_1 =                                                   \
  |  | 1006|  72.1k|          _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));                 \
  |  | 1007|  72.1k|      const __m128i src_2 =                                                   \
  |  | 1008|  72.1k|          _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));                 \
  |  | 1009|  72.1k|      const __m128i src_3 =                                                   \
  |  | 1010|  72.1k|          _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));                 \
  |  | 1011|  72.1k|      __m256i src_10 =                                                        \
  |  | 1012|  72.1k|          _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1);   \
  |  | 1013|  72.1k|      __m256i src_32 =                                                        \
  |  | 1014|  72.1k|          _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1);   \
  |  | 1015|  72.1k|                                                                              \
  |  | 1016|  72.1k|      src_10 = _mm256_unpacklo_epi8(src_10, zero);                            \
  |  | 1017|  72.1k|      src_32 = _mm256_unpacklo_epi8(src_32, zero);                            \
  |  | 1018|  72.1k|                                                                              \
  |  | 1019|  72.1k|      src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);                         \
  |  |  ------------------
  |  |  |  |  822|  72.1k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  72.1k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1020|  72.1k|      src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);                         \
  |  |  ------------------
  |  |  |  |  822|  72.1k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  72.1k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1021|  72.1k|                                                                              \
  |  | 1022|  72.1k|      src_10 = _mm256_add_epi16(src_10, offset_const);                        \
  |  | 1023|  72.1k|      src_32 = _mm256_add_epi16(src_32, offset_const);                        \
  |  | 1024|  72.1k|                                                                              \
  |  | 1025|  72.1k|      const __m256i ref_10 =                                                  \
  |  | 1026|  72.1k|          load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]);        \
  |  | 1027|  72.1k|      const __m256i ref_32 =                                                  \
  |  | 1028|  72.1k|          load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]);        \
  |  | 1029|  72.1k|      __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED);    \
  |  | 1030|  72.1k|      __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED);    \
  |  | 1031|  72.1k|                                                                              \
  |  | 1032|  72.1k|      res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const,     \
  |  | 1033|  72.1k|                                 rounding_shift);                             \
  |  | 1034|  72.1k|      res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const,     \
  |  | 1035|  72.1k|                                 rounding_shift);                             \
  |  | 1036|  72.1k|                                                                              \
  |  | 1037|  72.1k|      __m256i res = _mm256_packus_epi16(res_10, res_32);                      \
  |  | 1038|  72.1k|      const __m128i res_20 = _mm256_castsi256_si128(res);                     \
  |  | 1039|  72.1k|      const __m128i res_31 = _mm256_extracti128_si256(res, 1);                \
  |  | 1040|  72.1k|                                                                              \
  |  | 1041|  72.1k|      _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20);          \
  |  | 1042|  72.1k|      _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31);        \
  |  | 1043|  72.1k|      _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20);          \
  |  | 1044|  72.1k|      _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31);        \
  |  | 1045|  72.1k|      i -= 4;                                                                 \
  |  | 1046|  72.1k|      src += 4 * src_stride;                                                  \
  |  | 1047|  72.1k|      dst += 4 * dst_stride;                                                  \
  |  | 1048|  72.1k|      dst0 += 4 * dst_stride0;                                                \
  |  | 1049|  72.1k|    } while (i);                                                              \
  |  |  ------------------
  |  |  |  Branch (1049:14): [True: 46.3k, False: 25.7k]
  |  |  ------------------
  |  | 1050|  25.7k|  } else {                                                                    \
  |  | 1051|  15.6k|    assert(w == 4);                                                           \
  |  | 1052|  24.5k|    do {                                                                      \
  |  | 1053|  24.5k|      __m256i src_3210_8bit =                                                 \
  |  | 1054|  24.5k|          _mm256_setr_epi32(loadu_int32(src + 0 * src_stride),                \
  |  | 1055|  24.5k|                            loadu_int32(src + 1 * src_stride), 0, 0,          \
  |  | 1056|  24.5k|                            loadu_int32(src + 2 * src_stride),                \
  |  | 1057|  24.5k|                            loadu_int32(src + 3 * src_stride), 0, 0);         \
  |  | 1058|  24.5k|                                                                              \
  |  | 1059|  24.5k|      __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero);           \
  |  | 1060|  24.5k|      src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT);                     \
  |  |  ------------------
  |  |  |  |  822|  24.5k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  24.5k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  | 1061|  24.5k|      src_3210 = _mm256_add_epi16(src_3210, offset_const);                    \
  |  | 1062|  24.5k|                                                                              \
  |  | 1063|  24.5k|      __m256i ref_3210 =                                                      \
  |  | 1064|  24.5k|          _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride),              \
  |  | 1065|  24.5k|                             *(int64_t *)(dst + 1 * dst_stride),              \
  |  | 1066|  24.5k|                             *(int64_t *)(dst + 2 * dst_stride),              \
  |  | 1067|  24.5k|                             *(int64_t *)(dst + 3 * dst_stride));             \
  |  | 1068|  24.5k|      __m256i res_3210 =                                                      \
  |  | 1069|  24.5k|          comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED);             \
  |  | 1070|  24.5k|                                                                              \
  |  | 1071|  24.5k|      res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \
  |  | 1072|  24.5k|                                   rounding_shift);                           \
  |  | 1073|  24.5k|                                                                              \
  |  | 1074|  24.5k|      res_3210 = _mm256_packus_epi16(res_3210, res_3210);                     \
  |  | 1075|  24.5k|      const __m128i res_10 = _mm256_castsi256_si128(res_3210);                \
  |  | 1076|  24.5k|      const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1);           \
  |  | 1077|  24.5k|                                                                              \
  |  | 1078|  24.5k|      *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10);           \
  |  | 1079|  24.5k|      *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32);           \
  |  | 1080|  24.5k|      *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1);        \
  |  | 1081|  24.5k|      *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1);        \
  |  | 1082|  24.5k|      i -= 4;                                                                 \
  |  | 1083|  24.5k|      src += 4 * src_stride;                                                  \
  |  | 1084|  24.5k|      dst += 4 * dst_stride;                                                  \
  |  | 1085|  24.5k|      dst0 += 4 * dst_stride0;                                                \
  |  | 1086|  24.5k|    } while (i);                                                              \
  |  |  ------------------
  |  |  |  Branch (1086:14): [True: 8.91k, False: 15.6k]
  |  |  ------------------
  |  | 1087|  15.6k|  }
  ------------------
 1118|   366k|    }
 1119|   534k|  } else {
 1120|   534k|    av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride,
 1121|   534k|                                              w, h, offset_const);
 1122|   534k|  }
 1123|   975k|}
jnt_convolve_avx2.c:unpack_weights_avx2:
   27|  1.44M|static inline __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
   28|  1.44M|  const int w0 = conv_params->fwd_offset;
   29|  1.44M|  const int w1 = conv_params->bck_offset;
   30|  1.44M|  const __m256i wt0 = _mm256_set1_epi16((int16_t)w0);
   31|  1.44M|  const __m256i wt1 = _mm256_set1_epi16((int16_t)w1);
   32|  1.44M|  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
   33|  1.44M|  return wt;
   34|  1.44M|}
jnt_convolve_avx2.c:load_line2_avx2:
   36|  8.94M|static inline __m256i load_line2_avx2(const void *a, const void *b) {
   37|  8.94M|  return _mm256_permute2x128_si256(
   38|  8.94M|      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
   39|  8.94M|      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
   40|  8.94M|}
jnt_convolve_avx2.c:av1_dist_wtd_convolve_2d_no_avg_copy_avx2:
  825|   534k|    int w, int h, const __m256i offset_const) {
  826|   534k|  int i = h;
  827|   534k|  if (w >= 16) {
  ------------------
  |  Branch (827:7): [True: 385k, False: 149k]
  ------------------
  828|   385k|    __m256i src_0, src_1, src_2, src_3;
  829|   385k|    if (w == 128) {
  ------------------
  |  Branch (829:9): [True: 65.8k, False: 319k]
  ------------------
  830|  8.39M|      do {
  831|  8.39M|        DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
  ------------------
  |  |  796|  8.39M|  do {                                                                  \
  |  |  797|  8.39M|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|  8.39M|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|  8.39M|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|  8.39M|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|  8.39M|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|  8.39M|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|  8.39M|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|  8.39M|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|  8.39M|                                                                        \
  |  |  806|  8.39M|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.39M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.39M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|  8.39M|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.39M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.39M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|  8.39M|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.39M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.39M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|  8.39M|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.39M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.39M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|  8.39M|                                                                        \
  |  |  811|  8.39M|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|  8.39M|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|  8.39M|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|  8.39M|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|  8.39M|                                                                        \
  |  |  816|  8.39M|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|  8.39M|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|  8.39M|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|  8.39M|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|  8.39M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  832|  8.39M|        DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112);
  ------------------
  |  |  796|  8.39M|  do {                                                                  \
  |  |  797|  8.39M|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|  8.39M|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|  8.39M|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|  8.39M|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|  8.39M|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|  8.39M|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|  8.39M|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|  8.39M|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|  8.39M|                                                                        \
  |  |  806|  8.39M|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.39M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.39M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|  8.39M|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.39M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.39M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|  8.39M|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.39M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.39M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|  8.39M|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  8.39M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  8.39M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|  8.39M|                                                                        \
  |  |  811|  8.39M|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|  8.39M|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|  8.39M|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|  8.39M|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|  8.39M|                                                                        \
  |  |  816|  8.39M|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|  8.39M|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|  8.39M|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|  8.39M|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|  8.39M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  833|  8.39M|        src += 1 * src_stride;
  834|  8.39M|        dst += 1 * dst_stride;
  835|  8.39M|        i -= 1;
  836|  8.39M|      } while (i);
  ------------------
  |  Branch (836:16): [True: 8.32M, False: 65.8k]
  ------------------
  837|   319k|    } else if (w == 64) {
  ------------------
  |  Branch (837:16): [True: 169k, False: 149k]
  ------------------
  838|  12.8M|      do {
  839|  12.8M|        DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
  ------------------
  |  |  796|  12.8M|  do {                                                                  \
  |  |  797|  12.8M|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|  12.8M|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|  12.8M|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|  12.8M|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|  12.8M|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|  12.8M|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|  12.8M|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|  12.8M|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|  12.8M|                                                                        \
  |  |  806|  12.8M|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  12.8M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  12.8M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|  12.8M|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  12.8M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  12.8M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|  12.8M|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  12.8M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  12.8M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|  12.8M|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  12.8M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  12.8M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|  12.8M|                                                                        \
  |  |  811|  12.8M|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|  12.8M|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|  12.8M|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|  12.8M|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|  12.8M|                                                                        \
  |  |  816|  12.8M|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|  12.8M|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|  12.8M|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|  12.8M|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|  12.8M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  840|  12.8M|        src += 1 * src_stride;
  841|  12.8M|        dst += 1 * dst_stride;
  842|  12.8M|        i -= 1;
  843|  12.8M|      } while (i);
  ------------------
  |  Branch (843:16): [True: 12.6M, False: 169k]
  ------------------
  844|   169k|    } else if (w == 32) {
  ------------------
  |  Branch (844:16): [True: 90.4k, False: 58.7k]
  ------------------
  845|  2.38M|      do {
  846|  2.38M|        DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16);
  ------------------
  |  |  796|  2.38M|  do {                                                                  \
  |  |  797|  2.38M|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|  2.38M|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|  2.38M|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|  2.38M|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|  2.38M|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|  2.38M|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|  2.38M|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|  2.38M|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|  2.38M|                                                                        \
  |  |  806|  2.38M|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.38M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.38M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|  2.38M|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.38M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.38M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|  2.38M|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.38M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.38M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|  2.38M|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|  2.38M|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|  2.38M|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|  2.38M|                                                                        \
  |  |  811|  2.38M|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|  2.38M|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|  2.38M|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|  2.38M|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|  2.38M|                                                                        \
  |  |  816|  2.38M|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|  2.38M|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|  2.38M|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|  2.38M|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|  2.38M|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  847|  2.38M|        src += 2 * src_stride;
  848|  2.38M|        dst += 2 * dst_stride;
  849|  2.38M|        i -= 2;
  850|  2.38M|      } while (i);
  ------------------
  |  Branch (850:16): [True: 2.29M, False: 90.4k]
  ------------------
  851|  90.4k|    } else if (w == 16) {
  ------------------
  |  Branch (851:16): [True: 59.7k, False: 18.4E]
  ------------------
  852|   293k|      do {
  853|   293k|        DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0);
  ------------------
  |  |  796|   293k|  do {                                                                  \
  |  |  797|   293k|    src_0 = _mm256_cvtepu8_epi16(                                       \
  |  |  798|   293k|        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0])));      \
  |  |  799|   293k|    src_1 = _mm256_cvtepu8_epi16(                                       \
  |  |  800|   293k|        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1])));      \
  |  |  801|   293k|    src_2 = _mm256_cvtepu8_epi16(                                       \
  |  |  802|   293k|        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2])));      \
  |  |  803|   293k|    src_3 = _mm256_cvtepu8_epi16(                                       \
  |  |  804|   293k|        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3])));      \
  |  |  805|   293k|                                                                        \
  |  |  806|   293k|    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|   293k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|   293k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  807|   293k|    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|   293k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|   293k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  808|   293k|    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|   293k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|   293k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  809|   293k|    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT);                       \
  |  |  ------------------
  |  |  |  |  822|   293k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  |  |  ------------------
  |  |  |  |  |  |   21|   293k|#define FILTER_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  810|   293k|                                                                        \
  |  |  811|   293k|    src_0 = _mm256_add_epi16(src_0, offset_const);                      \
  |  |  812|   293k|    src_1 = _mm256_add_epi16(src_1, offset_const);                      \
  |  |  813|   293k|    src_2 = _mm256_add_epi16(src_2, offset_const);                      \
  |  |  814|   293k|    src_3 = _mm256_add_epi16(src_3, offset_const);                      \
  |  |  815|   293k|                                                                        \
  |  |  816|   293k|    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
  |  |  817|   293k|    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
  |  |  818|   293k|    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
  |  |  819|   293k|    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
  |  |  820|   293k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (820:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
  854|   293k|        src += 4 * src_stride;
  855|   293k|        dst += 4 * dst_stride;
  856|   293k|        i -= 4;
  857|   293k|      } while (i);
  ------------------
  |  Branch (857:16): [True: 233k, False: 59.7k]
  ------------------
  858|  59.7k|    }
  859|   385k|  } else {
  860|   149k|    const __m256i zero = _mm256_setzero_si256();
  861|   385k|    do {
  862|   385k|      const __m128i src_row_0 =
  863|   385k|          _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));
  864|   385k|      const __m128i src_row_1 =
  865|   385k|          _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));
  866|   385k|      const __m128i src_row_2 =
  867|   385k|          _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));
  868|   385k|      const __m128i src_row_3 =
  869|   385k|          _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));
  870|       |
  871|   385k|      __m256i src_10 = _mm256_insertf128_si256(
  872|   385k|          _mm256_castsi128_si256(src_row_0), src_row_1, 1);
  873|   385k|      __m256i src_32 = _mm256_insertf128_si256(
  874|   385k|          _mm256_castsi128_si256(src_row_2), src_row_3, 1);
  875|       |
  876|   385k|      src_10 = _mm256_unpacklo_epi8(src_10, zero);
  877|   385k|      src_32 = _mm256_unpacklo_epi8(src_32, zero);
  878|       |
  879|   385k|      src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);
  ------------------
  |  |  822|   385k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  ------------------
  |  |  |  |   21|   385k|#define FILTER_BITS 7
  |  |  ------------------
  ------------------
  880|   385k|      src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);
  ------------------
  |  |  822|   385k|#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
  |  |  ------------------
  |  |  |  |   21|   385k|#define FILTER_BITS 7
  |  |  ------------------
  ------------------
  881|       |
  882|   385k|      src_10 = _mm256_add_epi16(src_10, offset_const);
  883|   385k|      src_32 = _mm256_add_epi16(src_32, offset_const);
  884|       |
  885|       |      // Accumulate values into the destination buffer
  886|   385k|      _mm_store_si128((__m128i *)(&dst[0 * dst_stride]),
  887|   385k|                      _mm256_castsi256_si128(src_10));
  888|   385k|      _mm_store_si128((__m128i *)(&dst[1 * dst_stride]),
  889|   385k|                      _mm256_extracti128_si256(src_10, 1));
  890|   385k|      _mm_store_si128((__m128i *)(&dst[2 * dst_stride]),
  891|   385k|                      _mm256_castsi256_si128(src_32));
  892|   385k|      _mm_store_si128((__m128i *)(&dst[3 * dst_stride]),
  893|   385k|                      _mm256_extracti128_si256(src_32, 1));
  894|       |
  895|   385k|      src += 4 * src_stride;
  896|   385k|      dst += 4 * dst_stride;
  897|   385k|      i -= 4;
  898|   385k|    } while (i);
  ------------------
  |  Branch (898:14): [True: 235k, False: 149k]
  ------------------
  899|   149k|  }
  900|   534k|}

av1_build_compound_diffwtd_mask_d16_avx2:
  498|  82.3k|    ConvolveParams *conv_params, int bd) {
  499|  82.3k|  const int shift =
  500|  82.3k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
  ------------------
  |  |   21|  82.3k|#define FILTER_BITS 7
  ------------------
  501|       |  // When rounding constant is added, there is a possibility of overflow.
  502|       |  // However that much precision is not required. Code should very well work for
  503|       |  // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But
  504|       |  // there is a possibility of corner case bugs.
  505|  82.3k|  assert(DIFF_FACTOR_LOG2 == 4);
  506|  82.3k|  assert(AOM_BLEND_A64_MAX_ALPHA == 64);
  507|       |
  508|  82.3k|  if (mask_type == DIFFWTD_38) {
  ------------------
  |  Branch (508:7): [True: 43.1k, False: 39.2k]
  ------------------
  509|  43.1k|    build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
  510|  43.1k|                                         src1_stride, h, w, shift);
  511|  43.1k|  } else {
  512|  39.2k|    build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
  513|  39.2k|                                             src1_stride, h, w, shift);
  514|  39.2k|  }
  515|  82.3k|}
reconinter_avx2.c:build_compound_diffwtd_mask_d16_avx2:
  174|  43.1k|    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
  175|  43.1k|  const int mask_base = 38;
  176|  43.1k|  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
  177|  43.1k|  const __m256i y38 = _mm256_set1_epi16(mask_base);
  178|  43.1k|  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  43.1k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  43.1k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  179|  43.1k|  int i = 0;
  180|  43.1k|  if (w == 4) {
  ------------------
  |  Branch (180:7): [True: 0, False: 43.1k]
  ------------------
  181|      0|    do {
  182|      0|      const __m128i s0A = xx_loadl_64(src0);
  183|      0|      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
  184|      0|      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
  185|      0|      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
  186|      0|      const __m128i s1A = xx_loadl_64(src1);
  187|      0|      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
  188|      0|      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
  189|      0|      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
  190|      0|      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
  191|      0|                                      _mm_unpacklo_epi64(s0A, s0B));
  192|      0|      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
  193|      0|                                      _mm_unpacklo_epi64(s1A, s1B));
  194|      0|      const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
  195|      0|      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
  196|      0|      xx_storeu_128(mask,
  197|      0|                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
  198|      0|      src0 += src0_stride << 2;
  199|      0|      src1 += src1_stride << 2;
  200|      0|      mask += 16;
  201|      0|      i += 4;
  202|      0|    } while (i < h);
  ------------------
  |  Branch (202:14): [True: 0, False: 0]
  ------------------
  203|  43.1k|  } else if (w == 8) {
  ------------------
  |  Branch (203:14): [True: 8.59k, False: 34.5k]
  ------------------
  204|  34.2k|    do {
  205|  34.2k|      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
  206|  34.2k|      const __m256i s0CD =
  207|  34.2k|          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
  208|  34.2k|      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
  209|  34.2k|      const __m256i s1CD =
  210|  34.2k|          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
  211|  34.2k|      const __m256i m16AB =
  212|  34.2k|          calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
  213|  34.2k|      const __m256i m16CD =
  214|  34.2k|          calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
  215|  34.2k|      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
  216|  34.2k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  217|  34.2k|      src0 += src0_stride << 2;
  218|  34.2k|      src1 += src1_stride << 2;
  219|  34.2k|      mask += 32;
  220|  34.2k|      i += 4;
  221|  34.2k|    } while (i < h);
  ------------------
  |  Branch (221:14): [True: 25.7k, False: 8.59k]
  ------------------
  222|  34.5k|  } else if (w == 16) {
  ------------------
  |  Branch (222:14): [True: 15.0k, False: 19.5k]
  ------------------
  223|   134k|    do {
  224|   134k|      const __m256i s0A = yy_loadu_256(src0);
  225|   134k|      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
  226|   134k|      const __m256i s1A = yy_loadu_256(src1);
  227|   134k|      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
  228|   134k|      const __m256i m16A =
  229|   134k|          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  230|   134k|      const __m256i m16B =
  231|   134k|          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  232|   134k|      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
  233|   134k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  234|   134k|      src0 += src0_stride << 1;
  235|   134k|      src1 += src1_stride << 1;
  236|   134k|      mask += 32;
  237|   134k|      i += 2;
  238|   134k|    } while (i < h);
  ------------------
  |  Branch (238:14): [True: 118k, False: 15.0k]
  ------------------
  239|  19.5k|  } else if (w == 32) {
  ------------------
  |  Branch (239:14): [True: 13.1k, False: 6.41k]
  ------------------
  240|   309k|    do {
  241|   309k|      const __m256i s0A = yy_loadu_256(src0);
  242|   309k|      const __m256i s0B = yy_loadu_256(src0 + 16);
  243|   309k|      const __m256i s1A = yy_loadu_256(src1);
  244|   309k|      const __m256i s1B = yy_loadu_256(src1 + 16);
  245|   309k|      const __m256i m16A =
  246|   309k|          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  247|   309k|      const __m256i m16B =
  248|   309k|          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  249|   309k|      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
  250|   309k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  251|   309k|      src0 += src0_stride;
  252|   309k|      src1 += src1_stride;
  253|   309k|      mask += 32;
  254|   309k|      i += 1;
  255|   309k|    } while (i < h);
  ------------------
  |  Branch (255:14): [True: 296k, False: 13.1k]
  ------------------
  256|  13.1k|  } else if (w == 64) {
  ------------------
  |  Branch (256:14): [True: 4.83k, False: 1.58k]
  ------------------
  257|   215k|    do {
  258|   215k|      const __m256i s0A = yy_loadu_256(src0);
  259|   215k|      const __m256i s0B = yy_loadu_256(src0 + 16);
  260|   215k|      const __m256i s0C = yy_loadu_256(src0 + 32);
  261|   215k|      const __m256i s0D = yy_loadu_256(src0 + 48);
  262|   215k|      const __m256i s1A = yy_loadu_256(src1);
  263|   215k|      const __m256i s1B = yy_loadu_256(src1 + 16);
  264|   215k|      const __m256i s1C = yy_loadu_256(src1 + 32);
  265|   215k|      const __m256i s1D = yy_loadu_256(src1 + 48);
  266|   215k|      const __m256i m16A =
  267|   215k|          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  268|   215k|      const __m256i m16B =
  269|   215k|          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  270|   215k|      const __m256i m16C =
  271|   215k|          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
  272|   215k|      const __m256i m16D =
  273|   215k|          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
  274|   215k|      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
  275|   215k|      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
  276|   215k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
  277|   215k|      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
  278|   215k|      src0 += src0_stride;
  279|   215k|      src1 += src1_stride;
  280|   215k|      mask += 64;
  281|   215k|      i += 1;
  282|   215k|    } while (i < h);
  ------------------
  |  Branch (282:14): [True: 210k, False: 4.83k]
  ------------------
  283|  4.83k|  } else {
  284|   159k|    do {
  285|   159k|      const __m256i s0A = yy_loadu_256(src0);
  286|   159k|      const __m256i s0B = yy_loadu_256(src0 + 16);
  287|   159k|      const __m256i s0C = yy_loadu_256(src0 + 32);
  288|   159k|      const __m256i s0D = yy_loadu_256(src0 + 48);
  289|   159k|      const __m256i s0E = yy_loadu_256(src0 + 64);
  290|   159k|      const __m256i s0F = yy_loadu_256(src0 + 80);
  291|   159k|      const __m256i s0G = yy_loadu_256(src0 + 96);
  292|   159k|      const __m256i s0H = yy_loadu_256(src0 + 112);
  293|   159k|      const __m256i s1A = yy_loadu_256(src1);
  294|   159k|      const __m256i s1B = yy_loadu_256(src1 + 16);
  295|   159k|      const __m256i s1C = yy_loadu_256(src1 + 32);
  296|   159k|      const __m256i s1D = yy_loadu_256(src1 + 48);
  297|   159k|      const __m256i s1E = yy_loadu_256(src1 + 64);
  298|   159k|      const __m256i s1F = yy_loadu_256(src1 + 80);
  299|   159k|      const __m256i s1G = yy_loadu_256(src1 + 96);
  300|   159k|      const __m256i s1H = yy_loadu_256(src1 + 112);
  301|   159k|      const __m256i m16A =
  302|   159k|          calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  303|   159k|      const __m256i m16B =
  304|   159k|          calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  305|   159k|      const __m256i m16C =
  306|   159k|          calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
  307|   159k|      const __m256i m16D =
  308|   159k|          calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
  309|   159k|      const __m256i m16E =
  310|   159k|          calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
  311|   159k|      const __m256i m16F =
  312|   159k|          calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
  313|   159k|      const __m256i m16G =
  314|   159k|          calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
  315|   159k|      const __m256i m16H =
  316|   159k|          calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
  317|   159k|      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
  318|   159k|      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
  319|   159k|      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
  320|   159k|      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
  321|   159k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
  322|   159k|      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
  323|   159k|      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
  324|   159k|      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
  325|   159k|      src0 += src0_stride;
  326|   159k|      src1 += src1_stride;
  327|   159k|      mask += 128;
  328|   159k|      i += 1;
  329|   159k|    } while (i < h);
  ------------------
  |  Branch (329:14): [True: 157k, False: 1.58k]
  ------------------
  330|  1.58k|  }
  331|  43.1k|}
reconinter_avx2.c:calc_mask_d16_avx2:
  142|  3.08M|                                         const __m256i *clip_diff, int round) {
  143|  3.08M|  const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
  144|  3.08M|  const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
  145|  3.08M|  const __m256i diff = _mm256_max_epu16(diffa, diffb);
  146|  3.08M|  const __m256i diff_round =
  147|  3.08M|      _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
  148|  3.08M|  const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
  ------------------
  |  |   42|  3.08M|#define DIFF_FACTOR_LOG2 4
  ------------------
  149|  3.08M|  const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
  150|  3.08M|  const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
  151|  3.08M|  return diff_clamp;
  152|  3.08M|}
reconinter_avx2.c:build_compound_diffwtd_mask_d16_inv_avx2:
  335|  39.2k|    const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
  336|  39.2k|  const int mask_base = 38;
  337|  39.2k|  const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
  338|  39.2k|  const __m256i y38 = _mm256_set1_epi16(mask_base);
  339|  39.2k|  const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  ------------------
  |  |   24|  39.2k|#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
  |  |  ------------------
  |  |  |  |   23|  39.2k|#define AOM_BLEND_A64_ROUND_BITS 6
  |  |  ------------------
  ------------------
  340|  39.2k|  int i = 0;
  341|  39.2k|  if (w == 4) {
  ------------------
  |  Branch (341:7): [True: 0, False: 39.2k]
  ------------------
  342|      0|    do {
  343|      0|      const __m128i s0A = xx_loadl_64(src0);
  344|      0|      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
  345|      0|      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
  346|      0|      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
  347|      0|      const __m128i s1A = xx_loadl_64(src1);
  348|      0|      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
  349|      0|      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
  350|      0|      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
  351|      0|      const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
  352|      0|                                      _mm_unpacklo_epi64(s0A, s0B));
  353|      0|      const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
  354|      0|                                      _mm_unpacklo_epi64(s1A, s1B));
  355|      0|      const __m256i m16 =
  356|      0|          calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
  357|      0|      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
  358|      0|      xx_storeu_128(mask,
  359|      0|                    _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
  360|      0|      src0 += src0_stride << 2;
  361|      0|      src1 += src1_stride << 2;
  362|      0|      mask += 16;
  363|      0|      i += 4;
  364|      0|    } while (i < h);
  ------------------
  |  Branch (364:14): [True: 0, False: 0]
  ------------------
  365|  39.2k|  } else if (w == 8) {
  ------------------
  |  Branch (365:14): [True: 8.94k, False: 30.2k]
  ------------------
  366|  38.0k|    do {
  367|  38.0k|      const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
  368|  38.0k|      const __m256i s0CD =
  369|  38.0k|          yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
  370|  38.0k|      const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
  371|  38.0k|      const __m256i s1CD =
  372|  38.0k|          yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
  373|  38.0k|      const __m256i m16AB =
  374|  38.0k|          calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
  375|  38.0k|      const __m256i m16CD =
  376|  38.0k|          calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
  377|  38.0k|      const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
  378|  38.0k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  379|  38.0k|      src0 += src0_stride << 2;
  380|  38.0k|      src1 += src1_stride << 2;
  381|  38.0k|      mask += 32;
  382|  38.0k|      i += 4;
  383|  38.0k|    } while (i < h);
  ------------------
  |  Branch (383:14): [True: 29.1k, False: 8.94k]
  ------------------
  384|  30.2k|  } else if (w == 16) {
  ------------------
  |  Branch (384:14): [True: 17.2k, False: 12.9k]
  ------------------
  385|   260k|    do {
  386|   260k|      const __m256i s0A = yy_loadu_256(src0);
  387|   260k|      const __m256i s0B = yy_loadu_256(src0 + src0_stride);
  388|   260k|      const __m256i s1A = yy_loadu_256(src1);
  389|   260k|      const __m256i s1B = yy_loadu_256(src1 + src1_stride);
  390|   260k|      const __m256i m16A =
  391|   260k|          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  392|   260k|      const __m256i m16B =
  393|   260k|          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  394|   260k|      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
  395|   260k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  396|   260k|      src0 += src0_stride << 1;
  397|   260k|      src1 += src1_stride << 1;
  398|   260k|      mask += 32;
  399|   260k|      i += 2;
  400|   260k|    } while (i < h);
  ------------------
  |  Branch (400:14): [True: 243k, False: 17.2k]
  ------------------
  401|  17.2k|  } else if (w == 32) {
  ------------------
  |  Branch (401:14): [True: 8.42k, False: 4.56k]
  ------------------
  402|   182k|    do {
  403|   182k|      const __m256i s0A = yy_loadu_256(src0);
  404|   182k|      const __m256i s0B = yy_loadu_256(src0 + 16);
  405|   182k|      const __m256i s1A = yy_loadu_256(src1);
  406|   182k|      const __m256i s1B = yy_loadu_256(src1 + 16);
  407|   182k|      const __m256i m16A =
  408|   182k|          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  409|   182k|      const __m256i m16B =
  410|   182k|          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  411|   182k|      const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
  412|   182k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
  413|   182k|      src0 += src0_stride;
  414|   182k|      src1 += src1_stride;
  415|   182k|      mask += 32;
  416|   182k|      i += 1;
  417|   182k|    } while (i < h);
  ------------------
  |  Branch (417:14): [True: 173k, False: 8.42k]
  ------------------
  418|  8.42k|  } else if (w == 64) {
  ------------------
  |  Branch (418:14): [True: 3.80k, False: 763]
  ------------------
  419|   209k|    do {
  420|   209k|      const __m256i s0A = yy_loadu_256(src0);
  421|   209k|      const __m256i s0B = yy_loadu_256(src0 + 16);
  422|   209k|      const __m256i s0C = yy_loadu_256(src0 + 32);
  423|   209k|      const __m256i s0D = yy_loadu_256(src0 + 48);
  424|   209k|      const __m256i s1A = yy_loadu_256(src1);
  425|   209k|      const __m256i s1B = yy_loadu_256(src1 + 16);
  426|   209k|      const __m256i s1C = yy_loadu_256(src1 + 32);
  427|   209k|      const __m256i s1D = yy_loadu_256(src1 + 48);
  428|   209k|      const __m256i m16A =
  429|   209k|          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  430|   209k|      const __m256i m16B =
  431|   209k|          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  432|   209k|      const __m256i m16C =
  433|   209k|          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
  434|   209k|      const __m256i m16D =
  435|   209k|          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
  436|   209k|      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
  437|   209k|      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
  438|   209k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
  439|   209k|      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
  440|   209k|      src0 += src0_stride;
  441|   209k|      src1 += src1_stride;
  442|   209k|      mask += 64;
  443|   209k|      i += 1;
  444|   209k|    } while (i < h);
  ------------------
  |  Branch (444:14): [True: 205k, False: 3.80k]
  ------------------
  445|  3.80k|  } else {
  446|  82.6k|    do {
  447|  82.6k|      const __m256i s0A = yy_loadu_256(src0);
  448|  82.6k|      const __m256i s0B = yy_loadu_256(src0 + 16);
  449|  82.6k|      const __m256i s0C = yy_loadu_256(src0 + 32);
  450|  82.6k|      const __m256i s0D = yy_loadu_256(src0 + 48);
  451|  82.6k|      const __m256i s0E = yy_loadu_256(src0 + 64);
  452|  82.6k|      const __m256i s0F = yy_loadu_256(src0 + 80);
  453|  82.6k|      const __m256i s0G = yy_loadu_256(src0 + 96);
  454|  82.6k|      const __m256i s0H = yy_loadu_256(src0 + 112);
  455|  82.6k|      const __m256i s1A = yy_loadu_256(src1);
  456|  82.6k|      const __m256i s1B = yy_loadu_256(src1 + 16);
  457|  82.6k|      const __m256i s1C = yy_loadu_256(src1 + 32);
  458|  82.6k|      const __m256i s1D = yy_loadu_256(src1 + 48);
  459|  82.6k|      const __m256i s1E = yy_loadu_256(src1 + 64);
  460|  82.6k|      const __m256i s1F = yy_loadu_256(src1 + 80);
  461|  82.6k|      const __m256i s1G = yy_loadu_256(src1 + 96);
  462|  82.6k|      const __m256i s1H = yy_loadu_256(src1 + 112);
  463|  82.6k|      const __m256i m16A =
  464|  82.6k|          calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
  465|  82.6k|      const __m256i m16B =
  466|  82.6k|          calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
  467|  82.6k|      const __m256i m16C =
  468|  82.6k|          calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
  469|  82.6k|      const __m256i m16D =
  470|  82.6k|          calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
  471|  82.6k|      const __m256i m16E =
  472|  82.6k|          calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
  473|  82.6k|      const __m256i m16F =
  474|  82.6k|          calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
  475|  82.6k|      const __m256i m16G =
  476|  82.6k|          calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
  477|  82.6k|      const __m256i m16H =
  478|  82.6k|          calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
  479|  82.6k|      const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
  480|  82.6k|      const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
  481|  82.6k|      const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
  482|  82.6k|      const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
  483|  82.6k|      yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
  484|  82.6k|      yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
  485|  82.6k|      yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
  486|  82.6k|      yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
  487|  82.6k|      src0 += src0_stride;
  488|  82.6k|      src1 += src1_stride;
  489|  82.6k|      mask += 128;
  490|  82.6k|      i += 1;
  491|  82.6k|    } while (i < h);
  ------------------
  |  Branch (491:14): [True: 81.9k, False: 763]
  ------------------
  492|    763|  }
  493|  39.2k|}
reconinter_avx2.c:calc_mask_d16_inv_avx2:
  159|  2.45M|                                             int round) {
  160|  2.45M|  const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
  161|  2.45M|  const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
  162|  2.45M|  const __m256i diff = _mm256_max_epu16(diffa, diffb);
  163|  2.45M|  const __m256i diff_round =
  164|  2.45M|      _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
  165|  2.45M|  const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
  ------------------
  |  |   42|  2.45M|#define DIFF_FACTOR_LOG2 4
  ------------------
  166|  2.45M|  const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
  167|  2.45M|  const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
  168|  2.45M|  const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);
  169|  2.45M|  return diff_const_16;
  170|  2.45M|}

av1_selfguided_restoration_avx2:
  553|   550k|                                    int highbd) {
  554|       |  // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
  555|       |  // Ctl and Dtl is 32-byte aligned.
  556|   550k|  const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
  ------------------
  |  |   69|   550k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  557|       |
  558|   550k|  int32_t *buf = aom_memalign(
  559|   550k|      32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
  ------------------
  |  |   69|   550k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  560|   550k|  if (!buf) return -1;
  ------------------
  |  Branch (560:7): [True: 0, False: 550k]
  ------------------
  561|       |
  562|   550k|  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  ------------------
  |  |   40|   550k|#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
  ------------------
  563|   550k|  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   39|   550k|#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
  ------------------
  564|       |
  565|       |  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  566|       |  // leading to a significant speed improvement.
  567|       |  // We also align the stride to a multiple of 32 bytes for efficiency.
  568|   550k|  int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
  ------------------
  |  |   69|   550k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
  569|       |
  570|       |  // The "tl" pointers point at the top-left of the initialised data for the
  571|       |  // array.
  572|   550k|  int32_t *Atl = buf + 0 * buf_elts + 7;
  573|   550k|  int32_t *Btl = buf + 1 * buf_elts + 7;
  574|   550k|  int32_t *Ctl = buf + 2 * buf_elts + 7;
  575|   550k|  int32_t *Dtl = buf + 3 * buf_elts + 7;
  576|       |
  577|       |  // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
  578|       |  // there's a zero row and column in A, B (integral images), so we move down
  579|       |  // and right one for them.
  580|   550k|  const int buf_diag_border =
  581|   550k|      SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   40|   550k|#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
  ------------------
                    SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   39|   550k|#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
  ------------------
  582|       |
  583|   550k|  int32_t *A0 = Atl + 1 + buf_stride;
  584|   550k|  int32_t *B0 = Btl + 1 + buf_stride;
  585|   550k|  int32_t *C0 = Ctl + 1 + buf_stride;
  586|   550k|  int32_t *D0 = Dtl + 1 + buf_stride;
  587|       |
  588|       |  // Finally, A, B, C, D point at position (0, 0).
  589|   550k|  int32_t *A = A0 + buf_diag_border;
  590|   550k|  int32_t *B = B0 + buf_diag_border;
  591|   550k|  int32_t *C = C0 + buf_diag_border;
  592|   550k|  int32_t *D = D0 + buf_diag_border;
  593|       |
  594|   550k|  const int dgd_diag_border =
  595|   550k|      SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   40|   550k|#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
  ------------------
                    SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
  ------------------
  |  |   39|   550k|#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
  ------------------
  596|   550k|  const uint8_t *dgd0 = dgd8 - dgd_diag_border;
  597|       |
  598|       |  // Generate integral images from the input. C will contain sums of squares; D
  599|       |  // will contain just sums
  600|   550k|  if (highbd)
  ------------------
  |  Branch (600:7): [True: 286k, False: 264k]
  ------------------
  601|   286k|    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
  ------------------
  |  |   75|   286k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  602|   286k|                           height_ext, Ctl, Dtl, buf_stride);
  603|   264k|  else
  604|   264k|    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
  605|   264k|                    buf_stride);
  606|       |
  607|   550k|  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  608|       |  // Write to flt0 and flt1
  609|       |  // If params->r == 0 we skip the corresponding filter. We only allow one of
  610|       |  // the radii to be 0, as having both equal to 0 would be equivalent to
  611|       |  // skipping SGR entirely.
  612|   550k|  assert(!(params->r[0] == 0 && params->r[1] == 0));
  613|   550k|  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
  614|   550k|  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
  615|       |
  616|   550k|  if (params->r[0] > 0) {
  ------------------
  |  Branch (616:7): [True: 470k, False: 80.0k]
  ------------------
  617|   470k|    calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
  618|   470k|                 sgr_params_idx, 0);
  619|   470k|    final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
  620|   470k|                      width, height, highbd);
  621|   470k|  }
  622|       |
  623|   550k|  if (params->r[1] > 0) {
  ------------------
  |  Branch (623:7): [True: 519k, False: 30.2k]
  ------------------
  624|   519k|    calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
  625|   519k|            1);
  626|   519k|    final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
  627|   519k|                 height, highbd);
  628|   519k|  }
  629|   550k|  aom_free(buf);
  630|   550k|  return 0;
  631|   550k|}
av1_apply_selfguided_restoration_avx2:
  637|   550k|                                          int bit_depth, int highbd) {
  638|   550k|  int32_t *flt0 = tmpbuf;
  639|   550k|  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
  ------------------
  |  |   87|   550k|  (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
  |  |  ------------------
  |  |  |  |   82|   550k|  (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
  |  |  |  |  ------------------
  |  |  |  |  |  |   80|   550k|#define RESTORATION_UNITSIZE_MAX 256
  |  |  |  |  ------------------
  |  |  |  |                 (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
  |  |  |  |  ------------------
  |  |  |  |  |  |   56|   550k|#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   40|   550k|#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
  |  |  ------------------
  |  |  |  |   84|   550k|  ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   80|   550k|#define RESTORATION_UNITSIZE_MAX 256
  |  |  |  |  ------------------
  |  |  |  |                 ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
  |  |  |  |  ------------------
  |  |  |  |  |  |   50|   550k|#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|   550k|#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  |  |   85|   550k|    RESTORATION_UNIT_OFFSET))
  |  |  |  |  ------------------
  |  |  |  |  |  |   37|   550k|#define RESTORATION_UNIT_OFFSET 8
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  640|   550k|  assert(width * height <= RESTORATION_UNITPELS_MAX);
  641|   550k|  const int ret = av1_selfguided_restoration_avx2(
  642|   550k|      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
  643|   550k|  if (ret != 0) return ret;
  ------------------
  |  Branch (643:7): [True: 0, False: 550k]
  ------------------
  644|   550k|  const sgr_params_type *const params = &av1_sgr_params[eps];
  645|   550k|  int xq[2];
  646|   550k|  av1_decode_xq(xqd, xq, params);
  647|       |
  648|   550k|  __m256i xq0 = _mm256_set1_epi32(xq[0]);
  649|   550k|  __m256i xq1 = _mm256_set1_epi32(xq[1]);
  650|       |
  651|  24.1M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (651:19): [True: 23.6M, False: 550k]
  ------------------
  652|       |    // Calculate output in batches of 16 pixels
  653|   106M|    for (int j = 0; j < width; j += 16) {
  ------------------
  |  Branch (653:21): [True: 82.7M, False: 23.6M]
  ------------------
  654|  82.7M|      const int k = i * width + j;
  655|  82.7M|      const int m = i * dst_stride + j;
  656|       |
  657|  82.7M|      const uint8_t *dat8ij = dat8 + i * stride + j;
  658|  82.7M|      __m256i ep_0, ep_1;
  659|  82.7M|      __m128i src_0, src_1;
  660|  82.7M|      if (highbd) {
  ------------------
  |  Branch (660:11): [True: 42.9M, False: 39.8M]
  ------------------
  661|  42.9M|        src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
  ------------------
  |  |   75|  42.9M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  662|  42.9M|        src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
  ------------------
  |  |   75|  42.9M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  663|  42.9M|        ep_0 = _mm256_cvtepu16_epi32(src_0);
  664|  42.9M|        ep_1 = _mm256_cvtepu16_epi32(src_1);
  665|  42.9M|      } else {
  666|  39.8M|        src_0 = xx_loadu_128(dat8ij);
  667|  39.8M|        ep_0 = _mm256_cvtepu8_epi32(src_0);
  668|  39.8M|        ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
  669|  39.8M|      }
  670|       |
  671|  82.7M|      const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  82.7M|#define SGRPROJ_RST_BITS 4
  ------------------
  672|  82.7M|      const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  82.7M|#define SGRPROJ_RST_BITS 4
  ------------------
  673|       |
  674|  82.7M|      __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
  ------------------
  |  |   99|  82.7M|#define SGRPROJ_PRJ_BITS 7
  ------------------
  675|  82.7M|      __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
  ------------------
  |  |   99|  82.7M|#define SGRPROJ_PRJ_BITS 7
  ------------------
  676|       |
  677|  82.7M|      if (params->r[0] > 0) {
  ------------------
  |  Branch (677:11): [True: 76.9M, False: 5.78M]
  ------------------
  678|  76.9M|        const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
  679|  76.9M|        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
  680|       |
  681|  76.9M|        const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
  682|  76.9M|        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
  683|  76.9M|      }
  684|       |
  685|  82.7M|      if (params->r[1] > 0) {
  ------------------
  |  Branch (685:11): [True: 76.7M, False: 6.02M]
  ------------------
  686|  76.7M|        const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
  687|  76.7M|        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
  688|       |
  689|  76.7M|        const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
  690|  76.7M|        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
  691|  76.7M|      }
  692|       |
  693|  82.7M|      const __m256i rounding =
  694|  82.7M|          round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |   99|  82.7M|#define SGRPROJ_PRJ_BITS 7
  ------------------
                        round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  82.7M|#define SGRPROJ_RST_BITS 4
  ------------------
  695|  82.7M|      const __m256i w_0 = _mm256_srai_epi32(
  696|  82.7M|          _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |   99|  82.7M|#define SGRPROJ_PRJ_BITS 7
  ------------------
                        _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  82.7M|#define SGRPROJ_RST_BITS 4
  ------------------
  697|  82.7M|      const __m256i w_1 = _mm256_srai_epi32(
  698|  82.7M|          _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |   99|  82.7M|#define SGRPROJ_PRJ_BITS 7
  ------------------
                        _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  82.7M|#define SGRPROJ_RST_BITS 4
  ------------------
  699|       |
  700|  82.7M|      if (highbd) {
  ------------------
  |  Branch (700:11): [True: 42.8M, False: 39.8M]
  ------------------
  701|       |        // Pack into 16 bits and clamp to [0, 2^bit_depth)
  702|       |        // Note that packing into 16 bits messes up the order of the bits,
  703|       |        // so we use a permute function to correct this
  704|  42.8M|        const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
  705|  42.8M|        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
  706|  42.8M|        const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
  707|  42.8M|        const __m256i res = _mm256_min_epi16(tmp2, max);
  708|  42.8M|        yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
  ------------------
  |  |   75|  42.8M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  709|  42.8M|      } else {
  710|       |        // Pack into 8 bits and clamp to [0, 256)
  711|       |        // Note that each pack messes up the order of the bits,
  712|       |        // so we use a permute function to correct this
  713|  39.8M|        const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
  714|  39.8M|        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
  715|  39.8M|        const __m256i res =
  716|  39.8M|            _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
  717|  39.8M|        const __m128i res2 =
  718|  39.8M|            _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
  719|  39.8M|        xx_storeu_128(dst8 + m, res2);
  720|  39.8M|      }
  721|  82.7M|    }
  722|  23.6M|  }
  723|   550k|  return 0;
  724|   550k|}
selfguided_avx2.c:integral_images_highbd:
  135|   286k|                                   int32_t *B, int buf_stride) {
  136|   286k|  const __m256i zero = _mm256_setzero_si256();
  137|       |  // Write out the zero top row
  138|   286k|  memset_zero_avx(A, &zero, (width + 8));
  139|   286k|  memset_zero_avx(B, &zero, (width + 8));
  140|       |
  141|  15.5M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (141:19): [True: 15.2M, False: 286k]
  ------------------
  142|       |    // Zero the left column.
  143|  15.2M|    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
  144|       |
  145|       |    // ldiff is the difference H - D where H is the output sample immediately
  146|       |    // to the left and D is the output sample above it. These are scalars,
  147|       |    // replicated across the eight lanes.
  148|  15.2M|    __m256i ldiff1 = zero, ldiff2 = zero;
  149|   143M|    for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (149:21): [True: 128M, False: 15.2M]
  ------------------
  150|   128M|      const int ABj = 1 + j;
  151|       |
  152|   128M|      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
  153|   128M|      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
  154|       |
  155|   128M|      const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
  156|   128M|      const __m256i x2 = _mm256_madd_epi16(x1, x1);
  157|       |
  158|   128M|      const __m256i sc1 = scan_32(x1);
  159|   128M|      const __m256i sc2 = scan_32(x2);
  160|       |
  161|   128M|      const __m256i row1 =
  162|   128M|          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
  163|   128M|      const __m256i row2 =
  164|   128M|          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
  165|       |
  166|   128M|      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
  167|   128M|      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
  168|       |
  169|       |      // Calculate the new H - D.
  170|   128M|      ldiff1 = _mm256_set1_epi32(
  171|   128M|          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
  172|   128M|      ldiff2 = _mm256_set1_epi32(
  173|   128M|          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
  174|   128M|    }
  175|  15.2M|  }
  176|   286k|}
selfguided_avx2.c:memset_zero_avx:
   69|  1.10M|static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) {
   70|  1.10M|  unsigned int i = 0;
   71|  2.94M|  for (i = 0; i < (count & 0xffffffe0); i += 32) {
  ------------------
  |  Branch (71:15): [True: 1.84M, False: 1.10M]
  ------------------
   72|  1.84M|    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
   73|  1.84M|    _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
   74|  1.84M|    _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
   75|  1.84M|    _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
   76|  1.84M|  }
   77|  2.21M|  for (; i < (count & 0xfffffff8); i += 8) {
  ------------------
  |  Branch (77:10): [True: 1.10M, False: 1.10M]
  ------------------
   78|  1.10M|    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
   79|  1.10M|  }
   80|  7.22M|  for (; i < count; i++) {
  ------------------
  |  Branch (80:10): [True: 6.12M, False: 1.10M]
  ------------------
   81|  6.12M|    dest[i] = 0;
   82|  6.12M|  }
   83|  1.10M|  return dest;
   84|  1.10M|}
selfguided_avx2.c:yy256_load_extend_16_32:
   29|   129M|static __m256i yy256_load_extend_16_32(const void *p) {
   30|   129M|  return _mm256_cvtepu16_epi32(xx_loadu_128(p));
   31|   129M|}
selfguided_avx2.c:scan_32:
   51|   405M|static __m256i scan_32(__m256i x) {
   52|   405M|  const __m256i x01 = _mm256_slli_si256(x, 4);
   53|   405M|  const __m256i x02 = _mm256_add_epi32(x, x01);
   54|   405M|  const __m256i x03 = _mm256_slli_si256(x02, 8);
   55|   405M|  const __m256i x04 = _mm256_add_epi32(x02, x03);
   56|   405M|  const int32_t s = _mm256_extract_epi32(x04, 3);
   57|   405M|  const __m128i s01 = _mm_set1_epi32(s);
   58|   405M|  const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1);
   59|   405M|  return _mm256_add_epi32(x04, s02);
   60|   405M|}
selfguided_avx2.c:integral_images:
   88|   264k|                            int buf_stride) {
   89|   264k|  const __m256i zero = _mm256_setzero_si256();
   90|       |  // Write out the zero top row
   91|   264k|  memset_zero_avx(A, &zero, (width + 8));
   92|   264k|  memset_zero_avx(B, &zero, (width + 8));
   93|  11.5M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (93:19): [True: 11.2M, False: 264k]
  ------------------
   94|       |    // Zero the left column.
   95|  11.2M|    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
   96|       |
   97|       |    // ldiff is the difference H - D where H is the output sample immediately
   98|       |    // to the left and D is the output sample above it. These are scalars,
   99|       |    // replicated across the eight lanes.
  100|  11.2M|    __m256i ldiff1 = zero, ldiff2 = zero;
  101|  89.7M|    for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (101:21): [True: 78.4M, False: 11.2M]
  ------------------
  102|  78.4M|      const int ABj = 1 + j;
  103|       |
  104|  78.4M|      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
  105|  78.4M|      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
  106|       |
  107|  78.4M|      const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
  108|  78.4M|      const __m256i x2 = _mm256_madd_epi16(x1, x1);
  109|       |
  110|  78.4M|      const __m256i sc1 = scan_32(x1);
  111|  78.4M|      const __m256i sc2 = scan_32(x2);
  112|       |
  113|  78.4M|      const __m256i row1 =
  114|  78.4M|          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
  115|  78.4M|      const __m256i row2 =
  116|  78.4M|          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
  117|       |
  118|  78.4M|      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
  119|  78.4M|      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
  120|       |
  121|       |      // Calculate the new H - D.
  122|  78.4M|      ldiff1 = _mm256_set1_epi32(
  123|  78.4M|          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
  124|  78.4M|      ldiff2 = _mm256_set1_epi32(
  125|  78.4M|          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
  126|  78.4M|    }
  127|  11.2M|  }
  128|   264k|}
selfguided_avx2.c:yy256_load_extend_8_32:
   23|  77.6M|static __m256i yy256_load_extend_8_32(const void *p) {
   24|  77.6M|  return _mm256_cvtepu8_epi32(xx_loadl_64(p));
   25|  77.6M|}
selfguided_avx2.c:calc_ab_fast:
  358|   471k|                         int radius_idx) {
  359|   471k|  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  360|   471k|  const int r = params->r[radius_idx];
  361|   471k|  const int n = (2 * r + 1) * (2 * r + 1);
  362|   471k|  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  363|       |  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
  364|   471k|  const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
  365|       |
  366|   471k|  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  ------------------
  |  |  117|   471k|#define SGRPROJ_MTABLE_BITS 20
  ------------------
  367|   471k|  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
  ------------------
  |  |  118|   471k|#define SGRPROJ_RECIP_BITS 12
  ------------------
  368|       |
  369|       |  // Set up masks
  370|   471k|  const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
  371|   471k|  __m256i mask[8];
  372|  4.23M|  for (int idx = 0; idx < 8; idx++) {
  ------------------
  |  Branch (372:21): [True: 3.76M, False: 471k]
  ------------------
  373|  3.76M|    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
  374|  3.76M|    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  375|  3.76M|  }
  376|       |
  377|  12.1M|  for (int i = -1; i < height + 1; i += 2) {
  ------------------
  |  Branch (377:20): [True: 11.6M, False: 521k]
  ------------------
  378|  87.3M|    for (int j = -1; j < width + 1; j += 8) {
  ------------------
  |  Branch (378:22): [True: 75.6M, False: 11.6M]
  ------------------
  379|  75.6M|      const int32_t *Cij = C + i * buf_stride + j;
  380|  75.6M|      const int32_t *Dij = D + i * buf_stride + j;
  381|       |
  382|  75.6M|      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
  383|  75.6M|      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
  384|       |
  385|       |      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
  386|       |      // some uninitialised data in their upper words. We use a mask to
  387|       |      // ensure that these bits are set to 0.
  388|  75.6M|      int idx = AOMMIN(8, width + 1 - j);
  ------------------
  |  |   34|  75.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 67.8M, False: 7.83M]
  |  |  ------------------
  ------------------
  389|  75.6M|      assert(idx >= 1);
  390|       |
  391|  75.7M|      if (idx < 8) {
  ------------------
  |  Branch (391:11): [True: 11.8M, False: 63.8M]
  ------------------
  392|  11.8M|        sum1 = _mm256_and_si256(mask[idx], sum1);
  393|  11.8M|        sum2 = _mm256_and_si256(mask[idx], sum2);
  394|  11.8M|      }
  395|       |
  396|  75.7M|      const __m256i p = compute_p(sum1, sum2, bit_depth, n);
  397|       |
  398|  75.7M|      const __m256i z = _mm256_min_epi32(
  399|  75.7M|          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
  400|  75.7M|                            SGRPROJ_MTABLE_BITS),
  ------------------
  |  |  117|  75.7M|#define SGRPROJ_MTABLE_BITS 20
  ------------------
  401|  75.7M|          _mm256_set1_epi32(255));
  402|       |
  403|  75.7M|      const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
  404|       |
  405|  75.7M|      yy_storeu_256(A + i * buf_stride + j, a_res);
  406|       |
  407|  75.7M|      const __m256i a_complement =
  408|  75.7M|          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
  ------------------
  |  |  104|  75.7M|#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
  |  |  ------------------
  |  |  |  |  103|  75.7M|#define SGRPROJ_SGR_BITS 8
  |  |  ------------------
  ------------------
  409|       |
  410|       |      // sum1 might have lanes greater than 2^15, so we can't use madd to do
  411|       |      // multiplication involving sum1. However, a_complement and one_over_n
  412|       |      // are both less than 256, so we can multiply them first.
  413|  75.7M|      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
  414|  75.7M|      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
  415|  75.7M|      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
  416|  75.7M|                                              SGRPROJ_RECIP_BITS);
  ------------------
  |  |  118|  75.7M|#define SGRPROJ_RECIP_BITS 12
  ------------------
  417|       |
  418|  75.7M|      yy_storeu_256(B + i * buf_stride + j, b_res);
  419|  75.7M|    }
  420|  11.6M|  }
  421|   471k|}
selfguided_avx2.c:boxsum_from_ii:
  180|   285M|static inline __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
  181|   285M|  const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
  182|   285M|  const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
  183|   285M|  const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
  184|   285M|  const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
  185|   285M|  const __m256i u = _mm256_sub_epi32(tr, tl);
  186|   285M|  const __m256i v = _mm256_sub_epi32(br, bl);
  187|   285M|  return _mm256_sub_epi32(v, u);
  188|   285M|}
selfguided_avx2.c:compute_p:
  194|   142M|static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
  195|   142M|  __m256i an, bb;
  196|   142M|  if (bit_depth > 8) {
  ------------------
  |  Branch (196:7): [True: 73.8M, False: 69.1M]
  ------------------
  197|  73.8M|    const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
  198|  73.8M|    const __m256i rounding_b = round_for_shift(bit_depth - 8);
  199|  73.8M|    const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
  200|  73.8M|    const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
  201|  73.8M|    const __m256i a =
  202|  73.8M|        _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
  203|  73.8M|    const __m256i b =
  204|  73.8M|        _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
  205|       |    // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
  206|       |    // mullo to square it
  207|  73.8M|    bb = _mm256_madd_epi16(b, b);
  208|  73.8M|    an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
  209|  73.8M|  } else {
  210|  69.1M|    bb = _mm256_madd_epi16(sum1, sum1);
  211|  69.1M|    an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
  212|  69.1M|  }
  213|   142M|  return _mm256_sub_epi32(an, bb);
  214|   142M|}
selfguided_avx2.c:final_filter_fast:
  496|   469k|                              int height, int highbd) {
  497|   469k|  const int nb0 = 5;
  498|   469k|  const int nb1 = 4;
  499|       |
  500|   469k|  const __m256i rounding0 =
  501|   469k|      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|   469k|#define SGRPROJ_SGR_BITS 8
  ------------------
                    round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   469k|#define SGRPROJ_RST_BITS 4
  ------------------
  502|   469k|  const __m256i rounding1 =
  503|   469k|      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|   469k|#define SGRPROJ_SGR_BITS 8
  ------------------
                    round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   469k|#define SGRPROJ_RST_BITS 4
  ------------------
  504|       |
  505|   469k|  const uint8_t *dgd_real =
  506|   469k|      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
  ------------------
  |  |   75|   262k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  |  Branch (506:7): [True: 262k, False: 206k]
  ------------------
  507|       |
  508|  24.7M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (508:19): [True: 24.2M, False: 469k]
  ------------------
  509|  24.2M|    if (!(i & 1)) {  // even row
  ------------------
  |  Branch (509:9): [True: 12.3M, False: 11.9M]
  ------------------
  510|   103M|      for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (510:23): [True: 91.3M, False: 12.3M]
  ------------------
  511|  91.3M|        const __m256i a =
  512|  91.3M|            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
  513|  91.3M|        const __m256i b =
  514|  91.3M|            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
  515|       |
  516|  91.3M|        const __m128i raw =
  517|  91.3M|            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
  518|  91.3M|        const __m256i src =
  519|  91.3M|            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
  ------------------
  |  Branch (519:13): [True: 49.7M, False: 41.5M]
  ------------------
  520|       |
  521|  91.3M|        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
  522|  91.3M|        __m256i w =
  523|  91.3M|            _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
  524|  91.3M|                              SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|  91.3M|#define SGRPROJ_SGR_BITS 8
  ------------------
                                            SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  91.3M|#define SGRPROJ_RST_BITS 4
  ------------------
  525|       |
  526|  91.3M|        yy_storeu_256(dst + i * dst_stride + j, w);
  527|  91.3M|      }
  528|  12.3M|    } else {  // odd row
  529|   103M|      for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (529:23): [True: 91.1M, False: 11.9M]
  ------------------
  530|  91.1M|        const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
  531|  91.1M|        const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
  532|       |
  533|  91.1M|        const __m128i raw =
  534|  91.1M|            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
  535|  91.1M|        const __m256i src =
  536|  91.1M|            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
  ------------------
  |  Branch (536:13): [True: 49.6M, False: 41.5M]
  ------------------
  537|       |
  538|  91.1M|        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
  539|  91.1M|        __m256i w =
  540|  91.1M|            _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
  541|  91.1M|                              SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|  91.1M|#define SGRPROJ_SGR_BITS 8
  ------------------
                                            SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|  91.1M|#define SGRPROJ_RST_BITS 4
  ------------------
  542|       |
  543|  91.1M|        yy_storeu_256(dst + i * dst_stride + j, w);
  544|  91.1M|      }
  545|  11.9M|    }
  546|  24.2M|  }
  547|   469k|}
selfguided_avx2.c:cross_sum_fast_even_row:
  440|   181M|static inline __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
  441|   181M|  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
  442|   181M|  const __m256i xt = yy_loadu_256(buf - stride);
  443|   181M|  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
  444|   181M|  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
  445|   181M|  const __m256i xb = yy_loadu_256(buf + stride);
  446|   181M|  const __m256i xbr = yy_loadu_256(buf + 1 + stride);
  447|       |
  448|   181M|  const __m256i fives =
  449|   181M|      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
  450|   181M|  const __m256i sixes = _mm256_add_epi32(xt, xb);
  451|   181M|  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
  452|       |
  453|   181M|  return _mm256_add_epi32(
  454|   181M|      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
  455|   181M|                       fives_plus_sixes),
  456|   181M|      sixes);
  457|   181M|}
selfguided_avx2.c:cross_sum_fast_odd_row:
  474|   179M|static inline __m256i cross_sum_fast_odd_row(const int32_t *buf) {
  475|   179M|  const __m256i xl = yy_loadu_256(buf - 1);
  476|   179M|  const __m256i x = yy_loadu_256(buf);
  477|   179M|  const __m256i xr = yy_loadu_256(buf + 1);
  478|       |
  479|   179M|  const __m256i fives = _mm256_add_epi32(xl, xr);
  480|   179M|  const __m256i sixes = x;
  481|       |
  482|   179M|  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
  483|       |
  484|   179M|  return _mm256_add_epi32(
  485|   179M|      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
  486|   179M|                       fives_plus_sixes),
  487|   179M|      sixes);
  488|   179M|}
selfguided_avx2.c:calc_ab:
  221|   520k|                    int sgr_params_idx, int radius_idx) {
  222|   520k|  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  223|   520k|  const int r = params->r[radius_idx];
  224|   520k|  const int n = (2 * r + 1) * (2 * r + 1);
  225|   520k|  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  226|       |  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
  227|   520k|  const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
  228|       |
  229|   520k|  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  ------------------
  |  |  117|   520k|#define SGRPROJ_MTABLE_BITS 20
  ------------------
  230|   520k|  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
  ------------------
  |  |  118|   520k|#define SGRPROJ_RECIP_BITS 12
  ------------------
  231|       |
  232|       |  // Set up masks
  233|   520k|  const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
  234|   520k|  __m256i mask[8];
  235|  4.68M|  for (int idx = 0; idx < 8; idx++) {
  ------------------
  |  Branch (235:21): [True: 4.16M, False: 520k]
  ------------------
  236|  4.16M|    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
  237|  4.16M|    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  238|  4.16M|  }
  239|       |
  240|  21.9M|  for (int i = -1; i < height + 1; ++i) {
  ------------------
  |  Branch (240:20): [True: 21.5M, False: 427k]
  ------------------
  241|   139M|    for (int j = -1; j < width + 1; j += 8) {
  ------------------
  |  Branch (241:22): [True: 118M, False: 21.4M]
  ------------------
  242|   118M|      const int32_t *Cij = C + i * buf_stride + j;
  243|   118M|      const int32_t *Dij = D + i * buf_stride + j;
  244|       |
  245|   118M|      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
  246|   118M|      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
  247|       |
  248|       |      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
  249|       |      // some uninitialised data in their upper words. We use a mask to
  250|       |      // ensure that these bits are set to 0.
  251|   118M|      int idx = AOMMIN(8, width + 1 - j);
  ------------------
  |  |   34|   118M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 105M, False: 12.2M]
  |  |  ------------------
  ------------------
  252|   118M|      assert(idx >= 1);
  253|       |
  254|   118M|      if (idx < 8) {
  ------------------
  |  Branch (254:11): [True: 22.2M, False: 95.8M]
  ------------------
  255|  22.2M|        sum1 = _mm256_and_si256(mask[idx], sum1);
  256|  22.2M|        sum2 = _mm256_and_si256(mask[idx], sum2);
  257|  22.2M|      }
  258|       |
  259|   118M|      const __m256i p = compute_p(sum1, sum2, bit_depth, n);
  260|       |
  261|   118M|      const __m256i z = _mm256_min_epi32(
  262|   118M|          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
  263|   118M|                            SGRPROJ_MTABLE_BITS),
  ------------------
  |  |  117|   118M|#define SGRPROJ_MTABLE_BITS 20
  ------------------
  264|   118M|          _mm256_set1_epi32(255));
  265|       |
  266|   118M|      const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
  267|       |
  268|   118M|      yy_storeu_256(A + i * buf_stride + j, a_res);
  269|       |
  270|   118M|      const __m256i a_complement =
  271|   118M|          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
  ------------------
  |  |  104|   118M|#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
  |  |  ------------------
  |  |  |  |  103|   118M|#define SGRPROJ_SGR_BITS 8
  |  |  ------------------
  ------------------
  272|       |
  273|       |      // sum1 might have lanes greater than 2^15, so we can't use madd to do
  274|       |      // multiplication involving sum1. However, a_complement and one_over_n
  275|       |      // are both less than 256, so we can multiply them first.
  276|   118M|      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
  277|   118M|      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
  278|   118M|      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
  279|   118M|                                              SGRPROJ_RECIP_BITS);
  ------------------
  |  |  118|   118M|#define SGRPROJ_RECIP_BITS 12
  ------------------
  280|       |
  281|   118M|      yy_storeu_256(B + i * buf_stride + j, b_res);
  282|   118M|    }
  283|  21.5M|  }
  284|   520k|}
selfguided_avx2.c:final_filter:
  326|   518k|                         int dgd_stride, int width, int height, int highbd) {
  327|   518k|  const int nb = 5;
  328|   518k|  const __m256i rounding =
  329|   518k|      round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|   518k|#define SGRPROJ_SGR_BITS 8
  ------------------
                    round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   518k|#define SGRPROJ_RST_BITS 4
  ------------------
  330|   518k|  const uint8_t *dgd_real =
  331|   518k|      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
  ------------------
  |  |   75|   264k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  |  Branch (331:7): [True: 264k, False: 254k]
  ------------------
  332|       |
  333|  26.6M|  for (int i = 0; i < height; ++i) {
  ------------------
  |  Branch (333:19): [True: 26.1M, False: 518k]
  ------------------
  334|   217M|    for (int j = 0; j < width; j += 8) {
  ------------------
  |  Branch (334:21): [True: 190M, False: 26.1M]
  ------------------
  335|   190M|      const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
  336|   190M|      const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
  337|       |
  338|   190M|      const __m128i raw =
  339|   190M|          xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
  340|   190M|      const __m256i src =
  341|   190M|          highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
  ------------------
  |  Branch (341:11): [True: 99.8M, False: 91.0M]
  ------------------
  342|       |
  343|   190M|      __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
  344|   190M|      __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
  345|   190M|                                    SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  ------------------
  |  |  103|   190M|#define SGRPROJ_SGR_BITS 8
  ------------------
                                                  SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  ------------------
  |  |  101|   190M|#define SGRPROJ_RST_BITS 4
  ------------------
  346|       |
  347|   190M|      yy_storeu_256(dst + i * dst_stride + j, w);
  348|   190M|    }
  349|  26.1M|  }
  350|   518k|}
selfguided_avx2.c:cross_sum:
  302|   375M|static inline __m256i cross_sum(const int32_t *buf, int stride) {
  303|   375M|  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
  304|   375M|  const __m256i xt = yy_loadu_256(buf - stride);
  305|   375M|  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
  306|   375M|  const __m256i xl = yy_loadu_256(buf - 1);
  307|   375M|  const __m256i x = yy_loadu_256(buf);
  308|   375M|  const __m256i xr = yy_loadu_256(buf + 1);
  309|   375M|  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
  310|   375M|  const __m256i xb = yy_loadu_256(buf + stride);
  311|   375M|  const __m256i xbr = yy_loadu_256(buf + 1 + stride);
  312|       |
  313|   375M|  const __m256i fours = _mm256_add_epi32(
  314|   375M|      xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
  315|   375M|  const __m256i threes =
  316|   375M|      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
  317|       |
  318|   375M|  return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
  319|   375M|                          threes);
  320|   375M|}
selfguided_avx2.c:round_for_shift:
  190|   179M|static __m256i round_for_shift(unsigned shift) {
  191|   179M|  return _mm256_set1_epi32((1 << shift) >> 1);
  192|   179M|}

av1_warp_affine_avx2:
 1030|   238k|                          int16_t beta, int16_t gamma, int16_t delta) {
 1031|   238k|  __m256i horz_out[8];
 1032|   238k|  int i, j, k;
 1033|   238k|  const int bd = 8;
 1034|   238k|  const int reduce_bits_horiz = conv_params->round_0;
 1035|   238k|  const int reduce_bits_vert = conv_params->is_compound
  ------------------
  |  Branch (1035:32): [True: 7.75k, False: 230k]
  ------------------
 1036|   238k|                                   ? conv_params->round_1
 1037|   238k|                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  ------------------
  |  |   21|   230k|#define FILTER_BITS 7
  ------------------
 1038|   238k|  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
 1039|   238k|  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
 1040|       |
 1041|   238k|  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
 1042|   238k|  const __m256i reduce_bits_vert_const =
 1043|   238k|      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
 1044|   238k|  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
 1045|   238k|  const int round_bits =
 1046|   238k|      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
 1047|   238k|  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
 1048|   238k|  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
 1049|       |
 1050|   238k|  const __m256i round_const = _mm256_set1_epi16(
 1051|   238k|      (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
 1052|   238k|  const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
 1053|       |
 1054|   238k|  __m256i res_sub_const, round_bits_const, wt;
 1055|   238k|  unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
 1056|   238k|                                          &res_sub_const, &round_bits_const,
 1057|   238k|                                          &wt);
 1058|       |
 1059|   238k|  __m256i res_add_const_1;
 1060|   238k|  if (conv_params->is_compound == 1) {
  ------------------
  |  Branch (1060:7): [True: 7.74k, False: 230k]
  ------------------
 1061|  7.74k|    res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
 1062|   230k|  } else {
 1063|   230k|    res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
 1064|   230k|                                        ((1 << reduce_bits_vert) >> 1));
 1065|   230k|  }
 1066|   238k|  const int32_t const1 = alpha * (-4) + beta * (-4) +
 1067|   238k|                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
  ------------------
  |  |  107|   238k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   238k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   238k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
 1068|   238k|                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  103|   238k|#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   238k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
                                       (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  107|   238k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   238k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   238k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
 1069|   238k|  const int32_t const2 = gamma * (-4) + delta * (-4) +
 1070|   238k|                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
  ------------------
  |  |  107|   238k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   238k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   238k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
 1071|   238k|                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  103|   238k|#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   238k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
                                       (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  ------------------
  |  |  107|   238k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   238k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   238k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
 1072|   238k|  const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
  ------------------
  |  |  105|   238k|#define WARP_PARAM_REDUCE_BITS 6
  ------------------
 1073|   238k|  const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
 1074|   238k|  const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
  ------------------
  |  |   21|   238k|#define FILTER_BITS 7
  ------------------
 1075|       |
 1076|   238k|  __m256i shuffle_src[4];
 1077|   238k|  shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
 1078|   238k|  shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
 1079|   238k|  shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
 1080|   238k|  shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
 1081|       |
 1082|   765k|  for (i = 0; i < p_height; i += 8) {
  ------------------
  |  Branch (1082:15): [True: 526k, False: 238k]
  ------------------
 1083|  2.55M|    for (j = 0; j < p_width; j += 8) {
  ------------------
  |  Branch (1083:17): [True: 2.02M, False: 526k]
  ------------------
 1084|  2.02M|      const int32_t src_x = (p_col + j + 4) << subsampling_x;
 1085|  2.02M|      const int32_t src_y = (p_row + i + 4) << subsampling_y;
 1086|  2.02M|      const int64_t dst_x =
 1087|  2.02M|          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
 1088|  2.02M|      const int64_t dst_y =
 1089|  2.02M|          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
 1090|  2.02M|      const int64_t x4 = dst_x >> subsampling_x;
 1091|  2.02M|      const int64_t y4 = dst_y >> subsampling_y;
 1092|       |
 1093|  2.02M|      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  2.02M|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 1094|  2.02M|      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  ------------------
  |  |   96|  2.02M|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 1095|  2.02M|      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  2.02M|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 1096|  2.02M|      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
  ------------------
  |  |   96|  2.02M|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 1097|       |
 1098|       |      // Add in all the constant terms, including rounding and offset
 1099|  2.02M|      sx4 += const1;
 1100|  2.02M|      sy4 += const2;
 1101|       |
 1102|  2.02M|      sx4 &= ~const3;
 1103|  2.02M|      sy4 &= ~const3;
 1104|       |
 1105|       |      // Horizontal filter
 1106|       |      // If the block is aligned such that, after clamping, every sample
 1107|       |      // would be taken from the leftmost/rightmost column, then we can
 1108|       |      // skip the expensive horizontal filter.
 1109|       |
 1110|  2.02M|      if (ix4 <= -7) {
  ------------------
  |  Branch (1110:11): [True: 121k, False: 1.90M]
  ------------------
 1111|   121k|        int iy, row = 0;
 1112|   971k|        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|   971k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 871k, False: 99.8k]
  |  |  ------------------
  ------------------
  |  Branch (1112:22): [True: 849k, False: 121k]
  ------------------
 1113|   849k|          iy = iy4 + k;
 1114|   849k|          iy = clamp(iy, 0, height - 1);
 1115|   849k|          const __m256i temp_0 =
 1116|   849k|              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
 1117|   849k|          iy = iy4 + k + 1;
 1118|   849k|          iy = clamp(iy, 0, height - 1);
 1119|   849k|          const __m256i temp_1 =
 1120|   849k|              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
 1121|   849k|          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
 1122|   849k|          row += 1;
 1123|   849k|        }
 1124|   121k|        iy = iy4 + k;
 1125|   121k|        iy = clamp(iy, 0, height - 1);
 1126|   121k|        horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
 1127|  1.90M|      } else if (ix4 >= width + 6) {
  ------------------
  |  Branch (1127:18): [True: 130k, False: 1.77M]
  ------------------
 1128|   130k|        int iy, row = 0;
 1129|  1.04M|        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  1.04M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 922k, False: 120k]
  |  |  ------------------
  ------------------
  |  Branch (1129:22): [True: 912k, False: 130k]
  ------------------
 1130|   912k|          iy = iy4 + k;
 1131|   912k|          iy = clamp(iy, 0, height - 1);
 1132|   912k|          const __m256i temp_0 = _mm256_set1_epi16(
 1133|   912k|              const4 + ref[iy * stride + (width - 1)] * const5);
 1134|   912k|          iy = iy4 + k + 1;
 1135|   912k|          iy = clamp(iy, 0, height - 1);
 1136|   912k|          const __m256i temp_1 = _mm256_set1_epi16(
 1137|   912k|              const4 + ref[iy * stride + (width - 1)] * const5);
 1138|   912k|          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
 1139|   912k|          row += 1;
 1140|   912k|        }
 1141|   130k|        iy = iy4 + k;
 1142|   130k|        iy = clamp(iy, 0, height - 1);
 1143|   130k|        horz_out[row] =
 1144|   130k|            _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
 1145|  1.77M|      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
  ------------------
  |  Branch (1145:18): [True: 55.6k, False: 1.71M]
  |  Branch (1145:37): [True: 22.1k, False: 1.69M]
  ------------------
 1146|  78.8k|        const int out_of_boundary_left = -(ix4 - 6);
 1147|  78.8k|        const int out_of_boundary_right = (ix4 + 8) - width;
 1148|  78.8k|        int iy, sx, row = 0;
 1149|   630k|        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|   630k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 520k, False: 110k]
  |  |  ------------------
  ------------------
  |  Branch (1149:22): [True: 552k, False: 78.8k]
  ------------------
 1150|   552k|          iy = iy4 + k;
 1151|   552k|          iy = clamp(iy, 0, height - 1);
 1152|   552k|          __m128i src0 =
 1153|   552k|              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
 1154|   552k|          iy = iy4 + k + 1;
 1155|   552k|          iy = clamp(iy, 0, height - 1);
 1156|   552k|          __m128i src1 =
 1157|   552k|              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
 1158|       |
 1159|   552k|          if (out_of_boundary_left >= 0) {
  ------------------
  |  Branch (1159:15): [True: 396k, False: 155k]
  ------------------
 1160|   396k|            const __m128i shuffle_reg_left =
 1161|   396k|                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
 1162|   396k|            src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
 1163|   396k|            src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
 1164|   396k|          }
 1165|   552k|          if (out_of_boundary_right >= 0) {
  ------------------
  |  Branch (1165:15): [True: 323k, False: 228k]
  ------------------
 1166|   323k|            const __m128i shuffle_reg_right = _mm_loadu_si128(
 1167|   323k|                (__m128i *)warp_pad_right[out_of_boundary_right]);
 1168|   323k|            src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
 1169|   323k|            src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
 1170|   323k|          }
 1171|   552k|          sx = sx4 + beta * (k + 4);
 1172|   552k|          const __m256i src_01 =
 1173|   552k|              _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
 1174|   552k|          horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
 1175|   552k|                                 shuffle_src, &round_const, &shift);
 1176|   552k|          row += 1;
 1177|   552k|        }
 1178|  78.8k|        iy = iy4 + k;
 1179|  78.8k|        iy = clamp(iy, 0, height - 1);
 1180|  78.8k|        __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
 1181|  78.8k|        if (out_of_boundary_left >= 0) {
  ------------------
  |  Branch (1181:13): [True: 56.6k, False: 22.2k]
  ------------------
 1182|  56.6k|          const __m128i shuffle_reg_left =
 1183|  56.6k|              _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
 1184|  56.6k|          src = _mm_shuffle_epi8(src, shuffle_reg_left);
 1185|  56.6k|        }
 1186|  78.8k|        if (out_of_boundary_right >= 0) {
  ------------------
  |  Branch (1186:13): [True: 46.2k, False: 32.6k]
  ------------------
 1187|  46.2k|          const __m128i shuffle_reg_right =
 1188|  46.2k|              _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
 1189|  46.2k|          src = _mm_shuffle_epi8(src, shuffle_reg_right);
 1190|  46.2k|        }
 1191|  78.8k|        sx = sx4 + beta * (k + 4);
 1192|  78.8k|        const __m256i src_01 = _mm256_castsi128_si256(src);
 1193|  78.8k|        __m256i coeff[4];
 1194|  78.8k|        prepare_horizontal_filter_coeff(alpha, sx, coeff);
 1195|  78.8k|        filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
 1196|  78.8k|                               &round_const, &shift, row);
 1197|  1.69M|      } else {
 1198|  1.69M|        prepare_warp_horizontal_filter_avx2(
 1199|  1.69M|            ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
 1200|  1.69M|            i, &round_const, &shift, shuffle_src);
 1201|  1.69M|      }
 1202|       |
 1203|       |      // Vertical filter
 1204|  2.02M|      prepare_warp_vertical_filter_avx2(
 1205|  2.02M|          pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
 1206|  2.02M|          p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
 1207|  2.02M|          &res_sub_const, &round_bits_const, &wt);
 1208|  2.02M|    }
 1209|   526k|  }
 1210|   238k|}
warp_plane_avx2.c:unpack_weights_and_set_round_const_avx2:
  433|   238k|    __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
  434|   238k|  *res_sub_const =
  435|   238k|      _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
  436|   238k|                        (1 << (offset_bits - conv_params->round_1 - 1)));
  437|   238k|  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
  438|       |
  439|   238k|  const int w0 = conv_params->fwd_offset;
  440|   238k|  const int w1 = conv_params->bck_offset;
  441|   238k|  const __m256i wt0 = _mm256_set1_epi16((short)w0);
  442|   238k|  const __m256i wt1 = _mm256_set1_epi16((short)w1);
  443|   238k|  *wt = _mm256_unpacklo_epi16(wt0, wt1);
  444|   238k|}
warp_plane_avx2.c:horizontal_filter_avx2:
  258|  3.87M|                                          const __m128i *shift) {
  259|  3.87M|  __m256i coeff[4];
  260|  3.87M|  prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
  261|  3.87M|  filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
  262|  3.87M|                         row);
  263|  3.87M|}
warp_plane_avx2.c:prepare_horizontal_filter_coeff_avx2:
  101|  3.87M|                                                        __m256i *coeff) {
  102|  3.87M|  __m128i tmp_0 = _mm_loadl_epi64(
  103|  3.87M|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
  104|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  105|  3.87M|  __m128i tmp_1 = _mm_loadl_epi64(
  106|  3.87M|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
  107|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  108|  3.87M|  __m128i tmp_2 = _mm_loadl_epi64(
  109|  3.87M|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
  110|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  111|  3.87M|  __m128i tmp_3 = _mm_loadl_epi64(
  112|  3.87M|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
  113|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  114|       |
  115|  3.87M|  __m128i tmp_4 = _mm_loadl_epi64(
  116|  3.87M|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
  117|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  118|  3.87M|  __m128i tmp_5 = _mm_loadl_epi64(
  119|  3.87M|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
  120|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  121|  3.87M|  __m128i tmp_6 = _mm_loadl_epi64(
  122|  3.87M|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
  123|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  124|  3.87M|  __m128i tmp_7 = _mm_loadl_epi64(
  125|  3.87M|      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
  126|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  127|       |
  128|  3.87M|  __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
  129|  3.87M|  __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
  130|  3.87M|  __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
  131|  3.87M|  __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);
  132|       |
  133|  3.87M|  __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
  134|  3.87M|  __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
  135|  3.87M|  __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
  136|  3.87M|  __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);
  137|       |
  138|  3.87M|  __m128i tmp_8 = _mm_loadl_epi64(
  139|  3.87M|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
  140|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  141|  3.87M|  tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);
  142|       |
  143|  3.87M|  __m128i tmp_9 = _mm_loadl_epi64(
  144|  3.87M|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
  145|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  146|  3.87M|  tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);
  147|       |
  148|  3.87M|  __m128i tmp_10 = _mm_loadl_epi64(
  149|  3.87M|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
  150|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  151|  3.87M|  tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);
  152|       |
  153|  3.87M|  __m128i tmp_11 = _mm_loadl_epi64(
  154|  3.87M|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
  155|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  156|  3.87M|  tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);
  157|       |
  158|  3.87M|  tmp_2 = _mm_loadl_epi64(
  159|  3.87M|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
  160|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  161|  3.87M|  tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);
  162|       |
  163|  3.87M|  tmp_3 = _mm_loadl_epi64(
  164|  3.87M|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
  165|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  166|  3.87M|  tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);
  167|       |
  168|  3.87M|  tmp_6 = _mm_loadl_epi64(
  169|  3.87M|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
  170|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  171|  3.87M|  tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);
  172|       |
  173|  3.87M|  tmp_7 = _mm_loadl_epi64(
  174|  3.87M|      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
  175|  3.87M|                                  WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.87M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.87M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.87M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  176|  3.87M|  tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);
  177|       |
  178|  3.87M|  const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
  179|  3.87M|  const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
  180|  3.87M|  const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
  181|  3.87M|  const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);
  182|       |
  183|  3.87M|  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
  184|  3.87M|  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
  185|  3.87M|  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
  186|  3.87M|  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
  187|       |
  188|  3.87M|  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
  189|  3.87M|  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
  190|  3.87M|  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
  191|  3.87M|  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
  192|  3.87M|}
warp_plane_avx2.c:prepare_horizontal_filter_coeff:
  265|   558k|                                                   __m256i *coeff) {
  266|   558k|  const __m128i tmp_0 = _mm_loadl_epi64(
  267|   558k|      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   558k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   558k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   558k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  268|   558k|  const __m128i tmp_1 = _mm_loadl_epi64(
  269|   558k|      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   558k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   558k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   558k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  270|   558k|  const __m128i tmp_2 = _mm_loadl_epi64(
  271|   558k|      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   558k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   558k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   558k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  272|   558k|  const __m128i tmp_3 = _mm_loadl_epi64(
  273|   558k|      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   558k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   558k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   558k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  274|   558k|  const __m128i tmp_4 = _mm_loadl_epi64(
  275|   558k|      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   558k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   558k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   558k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  276|   558k|  const __m128i tmp_5 = _mm_loadl_epi64(
  277|   558k|      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   558k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   558k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   558k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  278|   558k|  const __m128i tmp_6 = _mm_loadl_epi64(
  279|   558k|      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   558k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   558k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   558k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  280|   558k|  const __m128i tmp_7 = _mm_loadl_epi64(
  281|   558k|      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   558k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   558k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   558k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  282|       |
  283|   558k|  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  284|   558k|  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  285|   558k|  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  286|   558k|  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
  287|       |
  288|   558k|  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  289|   558k|  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  290|   558k|  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  291|   558k|  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
  292|       |
  293|   558k|  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
  294|   558k|  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
  295|   558k|  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
  296|   558k|  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
  297|   558k|}
warp_plane_avx2.c:filter_src_pixels_avx2:
   81|  14.0M|                                          const __m128i *shift, int row) {
   82|  14.0M|  const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
   83|  14.0M|  const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
   84|  14.0M|  const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
   85|  14.0M|  const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
   86|       |
   87|  14.0M|  const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
   88|  14.0M|  const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
   89|  14.0M|  const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
   90|  14.0M|  const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
   91|       |
   92|  14.0M|  const __m256i res_even = _mm256_add_epi16(res_02, res_46);
   93|  14.0M|  const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
   94|  14.0M|  const __m256i res =
   95|  14.0M|      _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
   96|  14.0M|  horz_out[row] = _mm256_srl_epi16(res, *shift);
   97|  14.0M|}
warp_plane_avx2.c:prepare_warp_horizontal_filter_avx2:
 1006|  1.69M|    const __m256i *shuffle_src) {
 1007|  1.69M|  if (alpha == 0 && beta == 0)
  ------------------
  |  Branch (1007:7): [True: 798k, False: 897k]
  |  Branch (1007:21): [True: 384k, False: 414k]
  ------------------
 1008|   384k|    warp_horizontal_filter_alpha0_beta0_avx2(
 1009|   384k|        ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
 1010|   384k|        round_const, shift, shuffle_src);
 1011|  1.31M|  else if (alpha == 0 && beta != 0)
  ------------------
  |  Branch (1011:12): [True: 414k, False: 897k]
  |  Branch (1011:26): [True: 414k, False: 0]
  ------------------
 1012|   414k|    warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
 1013|   414k|                                       alpha, beta, p_height, height, i,
 1014|   414k|                                       round_const, shift, shuffle_src);
 1015|   897k|  else if (alpha != 0 && beta == 0)
  ------------------
  |  Branch (1015:12): [True: 897k, False: 18.4E]
  |  Branch (1015:26): [True: 416k, False: 480k]
  ------------------
 1016|   416k|    warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
 1017|   416k|                                      alpha, beta, p_height, height, i,
 1018|   416k|                                      round_const, shift, shuffle_src);
 1019|   480k|  else
 1020|   480k|    warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
 1021|   480k|                                beta, p_height, height, i, round_const, shift,
 1022|   480k|                                shuffle_src);
 1023|  1.69M|}
warp_plane_avx2.c:warp_horizontal_filter_alpha0_beta0_avx2:
  403|   383k|    const __m256i *shuffle_src) {
  404|   383k|  (void)alpha;
  405|   383k|  int k, iy, row = 0;
  406|   383k|  __m256i coeff[4];
  407|   383k|  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
  408|  3.02M|  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  3.02M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.08M, False: 948k]
  |  |  ------------------
  ------------------
  |  Branch (408:16): [True: 2.64M, False: 383k]
  ------------------
  409|  2.64M|    iy = iy4 + k;
  410|  2.64M|    iy = clamp(iy, 0, height - 1);
  411|  2.64M|    const __m128i src0 =
  412|  2.64M|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  413|  2.64M|    iy = iy4 + k + 1;
  414|  2.64M|    iy = clamp(iy, 0, height - 1);
  415|  2.64M|    const __m128i src1 =
  416|  2.64M|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  417|  2.64M|    const __m256i src_01 =
  418|  2.64M|        _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
  419|  2.64M|    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  420|  2.64M|                           shift, row);
  421|  2.64M|    row += 1;
  422|  2.64M|  }
  423|   383k|  iy = iy4 + k;
  424|   383k|  iy = clamp(iy, 0, height - 1);
  425|   383k|  const __m256i src_01 = _mm256_castsi128_si256(
  426|   383k|      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  427|   383k|  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  428|   383k|                         shift, row);
  429|   383k|}
warp_plane_avx2.c:prepare_horizontal_filter_coeff_alpha0_avx2:
  235|  3.67M|                                                               __m256i *coeff) {
  236|  3.67M|  const __m128i tmp_0 =
  237|  3.67M|      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.67M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.67M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.67M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  238|  3.67M|  const __m128i tmp_1 = _mm_loadl_epi64(
  239|  3.67M|      (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|  3.67M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.67M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.67M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  240|       |
  241|  3.67M|  const __m256i res_0 =
  242|  3.67M|      _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
  243|       |
  244|  3.67M|  coeff[0] = _mm256_shuffle_epi8(
  245|  3.67M|      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
  246|  3.67M|  coeff[1] = _mm256_shuffle_epi8(
  247|  3.67M|      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
  248|  3.67M|  coeff[2] = _mm256_shuffle_epi8(
  249|  3.67M|      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
  250|  3.67M|  coeff[3] = _mm256_shuffle_epi8(
  251|  3.67M|      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
  252|  3.67M|}
warp_plane_avx2.c:warp_horizontal_filter_alpha0_avx2:
  336|   414k|    const __m256i *shuffle_src) {
  337|   414k|  (void)alpha;
  338|   414k|  int k, iy, sx, row = 0;
  339|   414k|  __m256i coeff[4];
  340|  3.29M|  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  3.29M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.91M, False: 384k]
  |  |  ------------------
  ------------------
  |  Branch (340:16): [True: 2.88M, False: 414k]
  ------------------
  341|  2.88M|    iy = iy4 + k;
  342|  2.88M|    iy = clamp(iy, 0, height - 1);
  343|  2.88M|    const __m128i src_0 =
  344|  2.88M|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  345|  2.88M|    iy = iy4 + k + 1;
  346|  2.88M|    iy = clamp(iy, 0, height - 1);
  347|  2.88M|    const __m128i src_1 =
  348|  2.88M|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  349|  2.88M|    const __m256i src_01 =
  350|  2.88M|        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
  351|  2.88M|    sx = sx4 + beta * (k + 4);
  352|  2.88M|    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
  353|  2.88M|    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  354|  2.88M|                           shift, row);
  355|  2.88M|    row += 1;
  356|  2.88M|  }
  357|   414k|  iy = iy4 + k;
  358|   414k|  iy = clamp(iy, 0, height - 1);
  359|   414k|  const __m256i src_01 = _mm256_castsi128_si256(
  360|   414k|      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  361|   414k|  sx = sx4 + beta * (k + 4);
  362|   414k|  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
  363|   414k|  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  364|   414k|                         shift, row);
  365|   414k|}
warp_plane_avx2.c:warp_horizontal_filter_beta0_avx2:
  371|   416k|    const __m256i *shuffle_src) {
  372|   416k|  (void)beta;
  373|   416k|  int k, iy, row = 0;
  374|   416k|  __m256i coeff[4];
  375|   416k|  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
  376|  3.32M|  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  3.32M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.07M, False: 1.24M]
  |  |  ------------------
  ------------------
  |  Branch (376:16): [True: 2.91M, False: 416k]
  ------------------
  377|  2.91M|    iy = iy4 + k;
  378|  2.91M|    iy = clamp(iy, 0, height - 1);
  379|  2.91M|    const __m128i src_0 =
  380|  2.91M|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  381|  2.91M|    iy = iy4 + k + 1;
  382|  2.91M|    iy = clamp(iy, 0, height - 1);
  383|  2.91M|    const __m128i src_1 =
  384|  2.91M|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  385|  2.91M|    const __m256i src_01 =
  386|  2.91M|        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
  387|  2.91M|    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  388|  2.91M|                           shift, row);
  389|  2.91M|    row += 1;
  390|  2.91M|  }
  391|   416k|  iy = iy4 + k;
  392|   416k|  iy = clamp(iy, 0, height - 1);
  393|   416k|  const __m256i src_01 = _mm256_castsi128_si256(
  394|   416k|      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  395|   416k|  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  396|   416k|                         shift, row);
  397|   416k|}
warp_plane_avx2.c:prepare_horizontal_filter_coeff_beta0_avx2:
  195|   416k|                                                              __m256i *coeff) {
  196|   416k|  __m128i tmp_0 = _mm_loadl_epi64(
  197|   416k|      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   416k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   416k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   416k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  198|   416k|  __m128i tmp_1 = _mm_loadl_epi64(
  199|   416k|      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   416k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   416k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   416k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  200|   416k|  __m128i tmp_2 = _mm_loadl_epi64(
  201|   416k|      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   416k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   416k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   416k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  202|   416k|  __m128i tmp_3 = _mm_loadl_epi64(
  203|   416k|      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   416k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   416k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   416k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  204|   416k|  __m128i tmp_4 = _mm_loadl_epi64(
  205|   416k|      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   416k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   416k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   416k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  206|   416k|  __m128i tmp_5 = _mm_loadl_epi64(
  207|   416k|      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   416k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   416k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   416k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  208|   416k|  __m128i tmp_6 = _mm_loadl_epi64(
  209|   416k|      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   416k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   416k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   416k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  210|   416k|  __m128i tmp_7 = _mm_loadl_epi64(
  211|   416k|      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
  ------------------
  |  |  107|   416k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   416k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   416k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  212|       |
  213|   416k|  tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  214|   416k|  tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  215|   416k|  tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  216|   416k|  tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
  217|       |
  218|   416k|  const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
  219|   416k|  const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
  220|   416k|  const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
  221|   416k|  const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
  222|       |
  223|   416k|  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
  224|   416k|  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
  225|   416k|  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
  226|   416k|  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
  227|       |
  228|   416k|  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
  229|   416k|  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
  230|   416k|  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
  231|   416k|  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
  232|   416k|}
warp_plane_avx2.c:warp_horizontal_filter_avx2:
  303|   480k|    const __m256i *shuffle_src) {
  304|   480k|  int k, iy, sx, row = 0;
  305|   480k|  __m256i coeff[4];
  306|  3.81M|  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
  ------------------
  |  |   34|  3.81M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 2.53M, False: 1.27M]
  |  |  ------------------
  ------------------
  |  Branch (306:16): [True: 3.32M, False: 480k]
  ------------------
  307|  3.32M|    iy = iy4 + k;
  308|  3.32M|    iy = clamp(iy, 0, height - 1);
  309|  3.32M|    const __m128i src_0 =
  310|  3.32M|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  311|  3.32M|    iy = iy4 + k + 1;
  312|  3.32M|    iy = clamp(iy, 0, height - 1);
  313|  3.32M|    const __m128i src_1 =
  314|  3.32M|        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
  315|  3.32M|    const __m256i src_01 =
  316|  3.32M|        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
  317|  3.32M|    sx = sx4 + beta * (k + 4);
  318|  3.32M|    horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
  319|  3.32M|                           round_const, shift);
  320|  3.32M|    row += 1;
  321|  3.32M|  }
  322|   480k|  iy = iy4 + k;
  323|   480k|  iy = clamp(iy, 0, height - 1);
  324|   480k|  const __m256i src_01 = _mm256_castsi128_si256(
  325|   480k|      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  326|   480k|  sx = sx4 + beta * (k + 4);
  327|   480k|  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  328|   480k|  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
  329|   480k|                         shift, row);
  330|   480k|}
warp_plane_avx2.c:prepare_warp_vertical_filter_avx2:
  979|  2.01M|    const __m256i *wt) {
  980|  2.01M|  if (gamma == 0 && delta == 0)
  ------------------
  |  Branch (980:7): [True: 863k, False: 1.15M]
  |  Branch (980:21): [True: 412k, False: 450k]
  ------------------
  981|   412k|    warp_vertical_filter_gamma0_delta0_avx2(
  982|   412k|        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
  983|   412k|        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
  984|   412k|        round_bits_const, wt);
  985|  1.60M|  else if (gamma == 0 && delta != 0)
  ------------------
  |  Branch (985:12): [True: 450k, False: 1.14M]
  |  Branch (985:26): [True: 450k, False: 0]
  ------------------
  986|   450k|    warp_vertical_filter_gamma0_avx2(
  987|   450k|        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
  988|   450k|        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
  989|   450k|        round_bits_const, wt);
  990|  1.15M|  else if (gamma != 0 && delta == 0)
  ------------------
  |  Branch (990:12): [True: 1.15M, False: 18.4E]
  |  Branch (990:26): [True: 172k, False: 978k]
  ------------------
  991|   172k|    warp_vertical_filter_delta0_avx2(
  992|   172k|        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
  993|   172k|        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
  994|   172k|        round_bits_const, wt);
  995|   977k|  else
  996|   977k|    warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
  997|   977k|                              p_height, p_stride, p_width, i, j, sy4,
  998|   977k|                              reduce_bits_vert, res_add_const, round_bits,
  999|   977k|                              res_sub_const, round_bits_const, wt);
 1000|  2.01M|}
warp_plane_avx2.c:warp_vertical_filter_gamma0_delta0_avx2:
  931|   412k|    const __m256i *wt) {
  932|   412k|  (void)gamma;
  933|   412k|  int k, row = 0;
  934|   412k|  __m256i src[8], coeffs[8];
  935|   412k|  const __m256i src_0 = horz_out[0];
  936|   412k|  const __m256i src_1 =
  937|   412k|      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  938|   412k|  const __m256i src_2 = horz_out[1];
  939|   412k|  const __m256i src_3 =
  940|   412k|      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  941|   412k|  const __m256i src_4 = horz_out[2];
  942|   412k|  const __m256i src_5 =
  943|   412k|      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
  944|       |
  945|   412k|  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  946|   412k|  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  947|   412k|  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
  948|       |
  949|   412k|  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  950|   412k|  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  951|   412k|  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
  952|       |
  953|   412k|  prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
  954|       |
  955|  2.07M|  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
  ------------------
  |  |   34|  2.07M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.50M, False: 572k]
  |  |  ------------------
  ------------------
  |  Branch (955:16): [True: 1.66M, False: 412k]
  ------------------
  956|  1.66M|    __m256i res_lo, res_hi;
  957|  1.66M|    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
  958|  1.66M|                                    row);
  959|  1.66M|    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
  960|  1.66M|                                      res_sub_const, round_bits_const, pred,
  961|  1.66M|                                      conv_params, i, j, k, reduce_bits_vert,
  962|  1.66M|                                      p_stride, p_width, round_bits);
  963|  1.66M|    src[0] = src[2];
  964|  1.66M|    src[2] = src[4];
  965|  1.66M|    src[4] = src[6];
  966|  1.66M|    src[1] = src[3];
  967|  1.66M|    src[3] = src[5];
  968|  1.66M|    src[5] = src[7];
  969|  1.66M|    row += 1;
  970|  1.66M|  }
  971|   412k|}
warp_plane_avx2.c:prepare_vertical_filter_coeffs_gamma0_avx2:
  600|  2.21M|                                                              __m256i *coeffs) {
  601|  2.21M|  const __m128i filt_0 = _mm_loadu_si128(
  602|  2.21M|      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  2.21M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  2.21M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  2.21M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  603|  2.21M|  const __m128i filt_1 = _mm_loadu_si128(
  604|  2.21M|      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  2.21M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  2.21M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  2.21M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  605|       |
  606|  2.21M|  __m256i res_0 =
  607|  2.21M|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
  608|       |
  609|  2.21M|  coeffs[0] = _mm256_shuffle_epi8(
  610|  2.21M|      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
  611|  2.21M|  coeffs[1] = _mm256_shuffle_epi8(
  612|  2.21M|      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
  613|  2.21M|  coeffs[2] = _mm256_shuffle_epi8(
  614|  2.21M|      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
  615|  2.21M|  coeffs[3] = _mm256_shuffle_epi8(
  616|  2.21M|      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
  617|       |
  618|  2.21M|  coeffs[4] = coeffs[0];
  619|  2.21M|  coeffs[5] = coeffs[1];
  620|  2.21M|  coeffs[6] = coeffs[2];
  621|  2.21M|  coeffs[7] = coeffs[3];
  622|  2.21M|}
warp_plane_avx2.c:filter_src_pixels_vertical_avx2:
  628|  8.07M|                                                   __m256i *res_hi, int row) {
  629|  8.07M|  const __m256i src_6 = horz_out[row + 3];
  630|  8.07M|  const __m256i src_7 =
  631|  8.07M|      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
  632|       |
  633|  8.07M|  src[6] = _mm256_unpacklo_epi16(src_6, src_7);
  634|       |
  635|  8.07M|  const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
  636|  8.07M|  const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
  637|  8.07M|  const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
  638|  8.07M|  const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
  639|       |
  640|  8.07M|  const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
  641|  8.07M|                                            _mm256_add_epi32(res_4, res_6));
  642|       |
  643|  8.07M|  src[7] = _mm256_unpackhi_epi16(src_6, src_7);
  644|       |
  645|  8.07M|  const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
  646|  8.07M|  const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
  647|  8.07M|  const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
  648|  8.07M|  const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
  649|       |
  650|  8.07M|  const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
  651|  8.07M|                                           _mm256_add_epi32(res_5, res_7));
  652|       |
  653|       |  // Rearrange pixels back into the order 0 ... 7
  654|  8.07M|  *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
  655|  8.07M|  *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
  656|  8.07M|}
warp_plane_avx2.c:store_vertical_filter_output_avx2:
  663|  8.07M|    const int round_bits) {
  664|  8.07M|  __m256i res_lo_1 = *res_lo;
  665|  8.07M|  __m256i res_hi_1 = *res_hi;
  666|       |
  667|  8.07M|  if (conv_params->is_compound) {
  ------------------
  |  Branch (667:7): [True: 212k, False: 7.86M]
  ------------------
  668|   212k|    __m128i *const p_0 =
  669|   212k|        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
  670|   212k|    __m128i *const p_1 =
  671|   212k|        (__m128i *)&conv_params
  672|   212k|            ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
  673|       |
  674|   212k|    res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
  675|   212k|                                 reduce_bits_vert);
  676|       |
  677|   212k|    const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
  678|   212k|    __m256i res_lo_16;
  679|   212k|    if (conv_params->do_average) {
  ------------------
  |  Branch (679:9): [True: 91.0k, False: 121k]
  ------------------
  680|  91.0k|      __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
  681|  91.0k|      __m128i *const dst8_1 =
  682|  91.0k|          (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
  683|  91.0k|      const __m128i p_16_0 = _mm_loadl_epi64(p_0);
  684|  91.0k|      const __m128i p_16_1 = _mm_loadl_epi64(p_1);
  685|  91.0k|      const __m256i p_16 =
  686|  91.0k|          _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
  687|  91.0k|      if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (687:11): [True: 48.3k, False: 42.7k]
  ------------------
  688|  48.3k|        const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
  689|  48.3k|        const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
  690|  48.3k|        const __m256i shifted_32 =
  691|  48.3k|            _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  48.3k|#define DIST_PRECISION_BITS 4
  ------------------
  692|  48.3k|        res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
  693|  48.3k|      } else {
  694|  42.7k|        res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
  695|  42.7k|      }
  696|  91.0k|      res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
  697|  91.0k|      res_lo_16 = _mm256_srai_epi16(
  698|  91.0k|          _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
  699|  91.0k|      const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
  700|  91.0k|      const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
  701|  91.0k|      const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
  702|  91.0k|      *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
  703|  91.0k|      *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
  704|   121k|    } else {
  705|   121k|      const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
  706|   121k|      const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
  707|   121k|      _mm_storel_epi64(p_0, temp_lo_16_0);
  708|   121k|      _mm_storel_epi64(p_1, temp_lo_16_1);
  709|   121k|    }
  710|   212k|    if (p_width > 4) {
  ------------------
  |  Branch (710:9): [True: 212k, False: 45]
  ------------------
  711|   212k|      __m128i *const p4_0 =
  712|   212k|          (__m128i *)&conv_params
  713|   212k|              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
  714|   212k|      __m128i *const p4_1 =
  715|   212k|          (__m128i *)&conv_params
  716|   212k|              ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
  717|   212k|      res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
  718|   212k|                                   reduce_bits_vert);
  719|   212k|      const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
  720|   212k|      __m256i res_hi_16;
  721|   212k|      if (conv_params->do_average) {
  ------------------
  |  Branch (721:11): [True: 91.0k, False: 121k]
  ------------------
  722|  91.0k|        __m128i *const dst8_4_0 =
  723|  91.0k|            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
  724|  91.0k|        __m128i *const dst8_4_1 =
  725|  91.0k|            (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
  726|  91.0k|        const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
  727|  91.0k|        const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
  728|  91.0k|        const __m256i p4_16 = _mm256_inserti128_si256(
  729|  91.0k|            _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
  730|  91.0k|        if (conv_params->use_dist_wtd_comp_avg) {
  ------------------
  |  Branch (730:13): [True: 48.3k, False: 42.7k]
  ------------------
  731|  48.3k|          const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
  732|  48.3k|          const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
  733|  48.3k|          const __m256i shifted_32 =
  734|  48.3k|              _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
  ------------------
  |  |   76|  48.3k|#define DIST_PRECISION_BITS 4
  ------------------
  735|  48.3k|          res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
  736|  48.3k|        } else {
  737|  42.7k|          res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
  738|  42.7k|        }
  739|  91.0k|        res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
  740|  91.0k|        res_hi_16 = _mm256_srai_epi16(
  741|  91.0k|            _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
  742|  91.0k|        __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
  743|  91.0k|        const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
  744|  91.0k|        const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
  745|  91.0k|        *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
  746|  91.0k|        *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
  747|   121k|      } else {
  748|   121k|        const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
  749|   121k|        const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
  750|   121k|        _mm_storel_epi64(p4_0, temp_hi_16_0);
  751|   121k|        _mm_storel_epi64(p4_1, temp_hi_16_1);
  752|   121k|      }
  753|   212k|    }
  754|  7.86M|  } else {
  755|  7.86M|    const __m256i res_lo_round = _mm256_srai_epi32(
  756|  7.86M|        _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
  757|  7.86M|    const __m256i res_hi_round = _mm256_srai_epi32(
  758|  7.86M|        _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
  759|       |
  760|  7.86M|    const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
  761|  7.86M|    const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
  762|  7.86M|    const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
  763|  7.86M|    const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
  764|       |
  765|       |    // Store, blending with 'pred' if needed
  766|  7.86M|    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
  767|  7.86M|    __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
  768|       |
  769|  7.86M|    if (p_width == 4) {
  ------------------
  |  Branch (769:9): [True: 0, False: 7.86M]
  ------------------
  770|      0|      *(int *)p = _mm_cvtsi128_si32(res_8bit0);
  771|      0|      *(int *)p1 = _mm_cvtsi128_si32(res_8bit1);
  772|  7.86M|    } else {
  773|  7.86M|      _mm_storel_epi64(p, res_8bit0);
  774|  7.86M|      _mm_storel_epi64(p1, res_8bit1);
  775|  7.86M|    }
  776|  7.86M|  }
  777|  8.07M|}
warp_plane_avx2.c:warp_vertical_filter_gamma0_avx2:
  834|   450k|    const __m256i *wt) {
  835|   450k|  (void)gamma;
  836|   450k|  int k, row = 0;
  837|   450k|  __m256i src[8];
  838|   450k|  const __m256i src_0 = horz_out[0];
  839|   450k|  const __m256i src_1 =
  840|   450k|      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  841|   450k|  const __m256i src_2 = horz_out[1];
  842|   450k|  const __m256i src_3 =
  843|   450k|      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  844|   450k|  const __m256i src_4 = horz_out[2];
  845|   450k|  const __m256i src_5 =
  846|   450k|      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
  847|       |
  848|   450k|  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  849|   450k|  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  850|   450k|  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
  851|       |
  852|   450k|  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  853|   450k|  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  854|   450k|  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
  855|       |
  856|  2.25M|  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
  ------------------
  |  |   34|  2.25M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.52M, False: 729k]
  |  |  ------------------
  ------------------
  |  Branch (856:16): [True: 1.80M, False: 450k]
  ------------------
  857|  1.80M|    int sy = sy4 + delta * (k + 4);
  858|  1.80M|    __m256i coeffs[8];
  859|  1.80M|    prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
  860|  1.80M|    __m256i res_lo, res_hi;
  861|  1.80M|    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
  862|  1.80M|                                    row);
  863|  1.80M|    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
  864|  1.80M|                                      res_sub_const, round_bits_const, pred,
  865|  1.80M|                                      conv_params, i, j, k, reduce_bits_vert,
  866|  1.80M|                                      p_stride, p_width, round_bits);
  867|  1.80M|    src[0] = src[2];
  868|  1.80M|    src[2] = src[4];
  869|  1.80M|    src[4] = src[6];
  870|  1.80M|    src[1] = src[3];
  871|  1.80M|    src[3] = src[5];
  872|  1.80M|    src[5] = src[7];
  873|  1.80M|    row += 1;
  874|  1.80M|  }
  875|   450k|}
warp_plane_avx2.c:warp_vertical_filter_delta0_avx2:
  883|   172k|    const __m256i *wt) {
  884|   172k|  (void)delta;
  885|   172k|  int k, row = 0;
  886|   172k|  __m256i src[8], coeffs[8];
  887|   172k|  const __m256i src_0 = horz_out[0];
  888|   172k|  const __m256i src_1 =
  889|   172k|      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  890|   172k|  const __m256i src_2 = horz_out[1];
  891|   172k|  const __m256i src_3 =
  892|   172k|      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  893|   172k|  const __m256i src_4 = horz_out[2];
  894|   172k|  const __m256i src_5 =
  895|   172k|      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
  896|       |
  897|   172k|  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  898|   172k|  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  899|   172k|  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
  900|       |
  901|   172k|  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  902|   172k|  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  903|   172k|  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
  904|       |
  905|   172k|  prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
  906|       |
  907|   860k|  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
  ------------------
  |  |   34|   860k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 732k, False: 128k]
  |  |  ------------------
  ------------------
  |  Branch (907:16): [True: 688k, False: 172k]
  ------------------
  908|   688k|    __m256i res_lo, res_hi;
  909|   688k|    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
  910|   688k|                                    row);
  911|   688k|    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
  912|   688k|                                      res_sub_const, round_bits_const, pred,
  913|   688k|                                      conv_params, i, j, k, reduce_bits_vert,
  914|   688k|                                      p_stride, p_width, round_bits);
  915|   688k|    src[0] = src[2];
  916|   688k|    src[2] = src[4];
  917|   688k|    src[4] = src[6];
  918|   688k|    src[1] = src[3];
  919|   688k|    src[3] = src[5];
  920|   688k|    src[5] = src[7];
  921|   688k|    row += 1;
  922|   688k|  }
  923|   172k|}
warp_plane_avx2.c:prepare_vertical_filter_coeffs_delta0_avx2:
  541|   172k|                                                              __m256i *coeffs) {
  542|   172k|  __m128i filt_00 =
  543|   172k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  544|   172k|                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|   172k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   172k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   172k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  545|   172k|  __m128i filt_01 =
  546|   172k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  547|   172k|                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|   172k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   172k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   172k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  548|   172k|  __m128i filt_02 =
  549|   172k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  550|   172k|                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|   172k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   172k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   172k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  551|   172k|  __m128i filt_03 =
  552|   172k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  553|   172k|                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|   172k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   172k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   172k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  554|       |
  555|   172k|  __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
  556|   172k|  __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
  557|   172k|  __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
  558|   172k|  __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
  559|       |
  560|   172k|  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  561|   172k|  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  562|   172k|  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  563|   172k|  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
  564|       |
  565|   172k|  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  566|   172k|  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  567|   172k|  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  568|   172k|  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
  569|       |
  570|   172k|  filt_00 =
  571|   172k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  572|   172k|                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|   172k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   172k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   172k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  573|   172k|  filt_01 =
  574|   172k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  575|   172k|                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|   172k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   172k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   172k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  576|   172k|  filt_02 =
  577|   172k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  578|   172k|                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|   172k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   172k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   172k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  579|   172k|  filt_03 =
  580|   172k|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  581|   172k|                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|   172k|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|   172k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|   172k|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  582|       |
  583|   172k|  filt_0 = _mm256_broadcastsi128_si256(filt_00);
  584|   172k|  filt_1 = _mm256_broadcastsi128_si256(filt_01);
  585|   172k|  filt_2 = _mm256_broadcastsi128_si256(filt_02);
  586|   172k|  filt_3 = _mm256_broadcastsi128_si256(filt_03);
  587|       |
  588|   172k|  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  589|   172k|  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  590|   172k|  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  591|   172k|  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
  592|       |
  593|   172k|  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  594|   172k|  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  595|   172k|  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  596|   172k|  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
  597|   172k|}
warp_plane_avx2.c:warp_vertical_filter_avx2:
  785|   979k|    const __m256i *wt) {
  786|   979k|  int k, row = 0;
  787|   979k|  __m256i src[8];
  788|   979k|  const __m256i src_0 = horz_out[0];
  789|   979k|  const __m256i src_1 =
  790|   979k|      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  791|   979k|  const __m256i src_2 = horz_out[1];
  792|   979k|  const __m256i src_3 =
  793|   979k|      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  794|   979k|  const __m256i src_4 = horz_out[2];
  795|   979k|  const __m256i src_5 =
  796|   979k|      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
  797|       |
  798|   979k|  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  799|   979k|  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  800|   979k|  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
  801|       |
  802|   979k|  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  803|   979k|  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  804|   979k|  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
  805|       |
  806|  4.89M|  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
  ------------------
  |  |   34|  4.89M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.71M, False: 1.18M]
  |  |  ------------------
  ------------------
  |  Branch (806:16): [True: 3.91M, False: 979k]
  ------------------
  807|  3.91M|    int sy = sy4 + delta * (k + 4);
  808|  3.91M|    __m256i coeffs[8];
  809|  3.91M|    prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
  810|  3.91M|    __m256i res_lo, res_hi;
  811|  3.91M|    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
  812|  3.91M|                                    row);
  813|  3.91M|    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
  814|  3.91M|                                      res_sub_const, round_bits_const, pred,
  815|  3.91M|                                      conv_params, i, j, k, reduce_bits_vert,
  816|  3.91M|                                      p_stride, p_width, round_bits);
  817|  3.91M|    src[0] = src[2];
  818|  3.91M|    src[2] = src[4];
  819|  3.91M|    src[4] = src[6];
  820|  3.91M|    src[1] = src[3];
  821|  3.91M|    src[3] = src[5];
  822|  3.91M|    src[5] = src[7];
  823|       |
  824|  3.91M|    row += 1;
  825|  3.91M|  }
  826|   979k|}
warp_plane_avx2.c:prepare_vertical_filter_coeffs_avx2:
  448|  3.91M|                                                       __m256i *coeffs) {
  449|  3.91M|  __m128i filt_00 =
  450|  3.91M|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  451|  3.91M|                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  452|  3.91M|  __m128i filt_01 =
  453|  3.91M|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  454|  3.91M|                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  455|  3.91M|  __m128i filt_02 =
  456|  3.91M|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  457|  3.91M|                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  458|  3.91M|  __m128i filt_03 =
  459|  3.91M|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  460|  3.91M|                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  461|       |
  462|  3.91M|  __m128i filt_10 = _mm_loadu_si128(
  463|  3.91M|      (__m128i *)(av1_warped_filter +
  464|  3.91M|                  (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  465|  3.91M|  __m128i filt_11 = _mm_loadu_si128(
  466|  3.91M|      (__m128i *)(av1_warped_filter +
  467|  3.91M|                  (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  468|  3.91M|  __m128i filt_12 = _mm_loadu_si128(
  469|  3.91M|      (__m128i *)(av1_warped_filter +
  470|  3.91M|                  (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  471|  3.91M|  __m128i filt_13 = _mm_loadu_si128(
  472|  3.91M|      (__m128i *)(av1_warped_filter +
  473|  3.91M|                  (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  474|       |
  475|  3.91M|  __m256i filt_0 =
  476|  3.91M|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  477|  3.91M|  __m256i filt_1 =
  478|  3.91M|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  479|  3.91M|  __m256i filt_2 =
  480|  3.91M|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  481|  3.91M|  __m256i filt_3 =
  482|  3.91M|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
  483|       |
  484|  3.91M|  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  485|  3.91M|  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  486|  3.91M|  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  487|  3.91M|  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
  488|       |
  489|  3.91M|  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  490|  3.91M|  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  491|  3.91M|  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  492|  3.91M|  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
  493|       |
  494|  3.91M|  filt_00 =
  495|  3.91M|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  496|  3.91M|                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  497|  3.91M|  filt_01 =
  498|  3.91M|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  499|  3.91M|                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  500|  3.91M|  filt_02 =
  501|  3.91M|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  502|  3.91M|                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  503|  3.91M|  filt_03 =
  504|  3.91M|      _mm_loadu_si128((__m128i *)(av1_warped_filter +
  505|  3.91M|                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  506|       |
  507|  3.91M|  filt_10 = _mm_loadu_si128(
  508|  3.91M|      (__m128i *)(av1_warped_filter +
  509|  3.91M|                  (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  510|  3.91M|  filt_11 = _mm_loadu_si128(
  511|  3.91M|      (__m128i *)(av1_warped_filter +
  512|  3.91M|                  (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  513|  3.91M|  filt_12 = _mm_loadu_si128(
  514|  3.91M|      (__m128i *)(av1_warped_filter +
  515|  3.91M|                  (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  516|  3.91M|  filt_13 = _mm_loadu_si128(
  517|  3.91M|      (__m128i *)(av1_warped_filter +
  518|  3.91M|                  (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
  ------------------
  |  |  107|  3.91M|#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  3.91M|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
  |  |  ------------------
  |  |  |  |  102|  3.91M|#define WARPEDPIXEL_PREC_BITS 6
  |  |  ------------------
  ------------------
  519|       |
  520|  3.91M|  filt_0 =
  521|  3.91M|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  522|  3.91M|  filt_1 =
  523|  3.91M|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  524|  3.91M|  filt_2 =
  525|  3.91M|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  526|  3.91M|  filt_3 =
  527|  3.91M|      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
  528|       |
  529|  3.91M|  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  530|  3.91M|  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  531|  3.91M|  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  532|  3.91M|  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
  533|       |
  534|  3.91M|  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  535|  3.91M|  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  536|  3.91M|  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  537|  3.91M|  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
  538|  3.91M|}

av1_wiener_convolve_add_src_avx2:
   48|   304k|                                      const WienerConvolveParams *conv_params) {
   49|   304k|  const int bd = 8;
   50|   304k|  assert(x_step_q4 == 16 && y_step_q4 == 16);
   51|   304k|  assert(!(w & 7));
   52|   304k|  (void)x_step_q4;
   53|   304k|  (void)y_step_q4;
   54|       |
   55|   304k|  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
  ------------------
  |  |   19|   304k|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
   56|   304k|  int im_h = h + SUBPEL_TAPS - 2;
  ------------------
  |  |   26|   304k|#define SUBPEL_TAPS 8
  ------------------
   57|   304k|  int im_stride = 8;
   58|   304k|  memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE);
  ------------------
  |  |   32|   304k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   304k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
   59|   304k|  int i, j;
   60|   304k|  const int center_tap = (SUBPEL_TAPS - 1) / 2;
  ------------------
  |  |   26|   304k|#define SUBPEL_TAPS 8
  ------------------
   61|   304k|  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
   62|       |
   63|   304k|  __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
   64|       |
   65|   304k|  assert(conv_params->round_0 > 0);
   66|       |
   67|   304k|  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
   68|   304k|  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
   69|   304k|  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
   70|   304k|  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
   71|       |
   72|   304k|  filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
   73|       |
   74|   304k|  const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
   75|   304k|  const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
   76|       |
   77|       |  // coeffs 0 1 0 1 0 1 0 1
   78|   304k|  coeffs_h[0] =
   79|   304k|      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
   80|       |  // coeffs 2 3 2 3 2 3 2 3
   81|   304k|  coeffs_h[1] =
   82|   304k|      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
   83|       |  // coeffs 4 5 4 5 4 5 4 5
   84|   304k|  coeffs_h[2] =
   85|   304k|      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
   86|       |  // coeffs 6 7 6 7 6 7 6 7
   87|   304k|  coeffs_h[3] =
   88|   304k|      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
   89|       |
   90|   304k|  const __m256i round_const_h =
   91|   304k|      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
   92|   304k|  const __m256i round_const_horz =
   93|   304k|      _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1)));
  ------------------
  |  |   21|   304k|#define FILTER_BITS 7
  ------------------
   94|   304k|  const __m256i clamp_low = _mm256_setzero_si256();
   95|   304k|  const __m256i clamp_high =
   96|   304k|      _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
  ------------------
  |  |   43|   304k|#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0))
  |  |  ------------------
  |  |  |  |   21|   304k|#define FILTER_BITS 7
  |  |  ------------------
  ------------------
   97|   304k|  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
   98|       |
   99|       |  // Add an offset to account for the "add_src" part of the convolve function.
  100|   304k|  const __m128i zero_128 = _mm_setzero_si128();
  101|   304k|  const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
  102|   304k|  const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
  103|       |
  104|   304k|  const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
  105|       |
  106|       |  // coeffs 0 1 0 1 0 1 0 1
  107|   304k|  coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
  108|       |  // coeffs 2 3 2 3 2 3 2 3
  109|   304k|  coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
  110|       |  // coeffs 4 5 4 5 4 5 4 5
  111|   304k|  coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
  112|       |  // coeffs 6 7 6 7 6 7 6 7
  113|   304k|  coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
  114|       |
  115|   304k|  const __m256i round_const_v =
  116|   304k|      _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
  117|   304k|                        (1 << (bd + conv_params->round_1 - 1)));
  118|   304k|  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
  119|       |
  120|  1.57M|  for (j = 0; j < w; j += 8) {
  ------------------
  |  Branch (120:15): [True: 1.27M, False: 304k]
  ------------------
  121|  16.8M|    for (i = 0; i < im_h; i += 2) {
  ------------------
  |  Branch (121:17): [True: 15.5M, False: 1.27M]
  ------------------
  122|  15.5M|      __m256i data = _mm256_castsi128_si256(
  123|  15.5M|          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
  124|       |
  125|       |      // Load the next line
  126|  15.5M|      if (i + 1 < im_h)
  ------------------
  |  Branch (126:11): [True: 16.3M, False: 18.4E]
  ------------------
  127|  16.3M|        data = _mm256_inserti128_si256(
  128|  15.5M|            data,
  129|  15.5M|            _mm_loadu_si128(
  130|  15.5M|                (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
  131|  15.5M|            1);
  132|       |
  133|  15.5M|      __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
  134|       |
  135|  15.5M|      res =
  136|  15.5M|          _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
  137|       |
  138|  15.5M|      __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
  139|       |
  140|       |      // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
  141|       |      // the result
  142|  15.5M|      data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
  ------------------
  |  |   21|  15.5M|#define FILTER_BITS 7
  ------------------
  143|  15.5M|      res = _mm256_add_epi16(res, data_0);
  144|  15.5M|      res = _mm256_add_epi16(res, round_const_horz);
  145|  15.5M|      const __m256i res_clamped =
  146|  15.5M|          _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
  147|  15.5M|      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
  148|  15.5M|    }
  149|       |
  150|       |    /* Vertical filter */
  151|  1.27M|    {
  152|  1.27M|      __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
  153|  1.27M|      __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
  154|  1.27M|      __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
  155|  1.27M|      __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
  156|  1.27M|      __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
  157|  1.27M|      __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
  158|       |
  159|  1.27M|      __m256i s[8];
  160|  1.27M|      s[0] = _mm256_unpacklo_epi16(src_0, src_1);
  161|  1.27M|      s[1] = _mm256_unpacklo_epi16(src_2, src_3);
  162|  1.27M|      s[2] = _mm256_unpacklo_epi16(src_4, src_5);
  163|       |
  164|  1.27M|      s[4] = _mm256_unpackhi_epi16(src_0, src_1);
  165|  1.27M|      s[5] = _mm256_unpackhi_epi16(src_2, src_3);
  166|  1.27M|      s[6] = _mm256_unpackhi_epi16(src_4, src_5);
  167|       |
  168|  20.3M|      for (i = 0; i < h - 1; i += 2) {
  ------------------
  |  Branch (168:19): [True: 19.1M, False: 1.27M]
  ------------------
  169|  19.1M|        const int16_t *data = &im_block[i * im_stride];
  170|       |
  171|  19.1M|        const __m256i s6 =
  172|  19.1M|            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
  173|  19.1M|        const __m256i s7 =
  174|  19.1M|            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
  175|       |
  176|  19.1M|        s[3] = _mm256_unpacklo_epi16(s6, s7);
  177|  19.1M|        s[7] = _mm256_unpackhi_epi16(s6, s7);
  178|       |
  179|  19.1M|        __m256i res_a = convolve(s, coeffs_v);
  180|  19.1M|        __m256i res_b = convolve(s + 4, coeffs_v);
  181|       |
  182|  19.1M|        const __m256i res_a_round = _mm256_sra_epi32(
  183|  19.1M|            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
  184|  19.1M|        const __m256i res_b_round = _mm256_sra_epi32(
  185|  19.1M|            _mm256_add_epi32(res_b, round_const_v), round_shift_v);
  186|       |
  187|       |        /* rounding code */
  188|       |        // 16 bit conversion
  189|  19.1M|        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
  190|       |        // 8 bit conversion and saturation to uint8
  191|  19.1M|        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
  192|       |
  193|  19.1M|        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
  194|  19.1M|        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
  195|       |
  196|       |        // Store values into the destination buffer
  197|  19.1M|        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
  198|  19.1M|        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
  199|       |
  200|  19.1M|        _mm_storel_epi64(p_0, res_0);
  201|  19.1M|        _mm_storel_epi64(p_1, res_1);
  202|       |
  203|  19.1M|        s[0] = s[1];
  204|  19.1M|        s[1] = s[2];
  205|  19.1M|        s[2] = s[3];
  206|       |
  207|  19.1M|        s[4] = s[5];
  208|  19.1M|        s[5] = s[6];
  209|  19.1M|        s[6] = s[7];
  210|  19.1M|      }
  211|  1.27M|      if (h - i) {
  ------------------
  |  Branch (211:11): [True: 2.48k, False: 1.26M]
  ------------------
  212|  2.48k|        s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
  213|  2.48k|        s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
  214|  2.48k|        s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
  215|       |
  216|  2.48k|        const int16_t *data = &im_block[i * im_stride];
  217|  2.48k|        const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
  218|  2.48k|        const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
  219|       |
  220|  2.48k|        __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
  221|  2.48k|        __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
  222|       |
  223|  2.48k|        s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
  224|  2.48k|        __m256i convolveres = convolve(s, coeffs_v);
  225|       |
  226|  2.48k|        const __m256i res_round = _mm256_sra_epi32(
  227|  2.48k|            _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
  228|       |
  229|       |        /* rounding code */
  230|       |        // 16 bit conversion
  231|  2.48k|        __m128i reslo = _mm256_castsi256_si128(res_round);
  232|  2.48k|        __m128i reshi = _mm256_extracti128_si256(res_round, 1);
  233|  2.48k|        const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
  234|       |
  235|       |        // 8 bit conversion and saturation to uint8
  236|  2.48k|        const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
  237|  2.48k|        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
  238|  2.48k|        _mm_storel_epi64(p_0, res_8b);
  239|  2.48k|      }
  240|  1.27M|    }
  241|  1.27M|  }
  242|   304k|}

av1_check_trailing_bits:
   89|  83.7k|int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
   90|       |  // bit_offset is set to 0 (mod 8) when the reader is already byte aligned
   91|  83.7k|  int bits_before_alignment = 8 - rb->bit_offset % 8;
   92|  83.7k|  int trailing = aom_rb_read_literal(rb, bits_before_alignment);
   93|  83.7k|  if (trailing != (1 << (bits_before_alignment - 1))) {
  ------------------
  |  Branch (93:7): [True: 6.63k, False: 77.1k]
  ------------------
   94|  6.63k|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
   95|  6.63k|    return -1;
   96|  6.63k|  }
   97|  77.1k|  return 0;
   98|  83.7k|}
av1_set_single_tile_decoding_mode:
 2159|  22.1k|void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) {
 2160|  22.1k|  cm->tiles.single_tile_decoding = 0;
 2161|  22.1k|  if (cm->tiles.large_scale) {
  ------------------
  |  Branch (2161:7): [True: 22.1k, False: 0]
  ------------------
 2162|  22.1k|    struct loopfilter *lf = &cm->lf;
 2163|  22.1k|    RestorationInfo *const rst_info = cm->rst_info;
 2164|  22.1k|    const CdefInfo *const cdef_info = &cm->cdef_info;
 2165|       |
 2166|       |    // Figure out single_tile_decoding by loopfilter_level.
 2167|  22.1k|    const int no_loopfilter = !(lf->filter_level[0] || lf->filter_level[1]);
  ------------------
  |  Branch (2167:33): [True: 6.04k, False: 16.1k]
  |  Branch (2167:56): [True: 5.99k, False: 10.1k]
  ------------------
 2168|  22.1k|    const int no_cdef = cdef_info->cdef_bits == 0 &&
  ------------------
  |  Branch (2168:25): [True: 20.9k, False: 1.19k]
  ------------------
 2169|  22.1k|                        cdef_info->cdef_strengths[0] == 0 &&
  ------------------
  |  Branch (2169:25): [True: 15.5k, False: 5.41k]
  ------------------
 2170|  22.1k|                        cdef_info->cdef_uv_strengths[0] == 0;
  ------------------
  |  Branch (2170:25): [True: 14.7k, False: 840]
  ------------------
 2171|  22.1k|    const int no_restoration =
 2172|  22.1k|        rst_info[0].frame_restoration_type == RESTORE_NONE &&
  ------------------
  |  Branch (2172:9): [True: 16.1k, False: 6.00k]
  ------------------
 2173|  22.1k|        rst_info[1].frame_restoration_type == RESTORE_NONE &&
  ------------------
  |  Branch (2173:9): [True: 15.5k, False: 591]
  ------------------
 2174|  22.1k|        rst_info[2].frame_restoration_type == RESTORE_NONE;
  ------------------
  |  Branch (2174:9): [True: 15.1k, False: 423]
  ------------------
 2175|  22.1k|    assert(IMPLIES(cm->features.coded_lossless, no_loopfilter && no_cdef));
 2176|  22.1k|    assert(IMPLIES(cm->features.all_lossless, no_restoration));
 2177|  22.1k|    cm->tiles.single_tile_decoding = no_loopfilter && no_cdef && no_restoration;
  ------------------
  |  Branch (2177:38): [True: 10.1k, False: 12.0k]
  |  Branch (2177:55): [True: 7.16k, False: 2.94k]
  |  Branch (2177:66): [True: 7.07k, False: 94]
  ------------------
 2178|  22.1k|  }
 2179|  22.1k|}
av1_dec_row_mt_dealloc:
 2542|  62.3k|void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) {
 2543|  62.3k|  if (dec_row_mt_sync != NULL) {
  ------------------
  |  Branch (2543:7): [True: 62.3k, False: 0]
  ------------------
 2544|  62.3k|#if CONFIG_MULTITHREAD
 2545|  62.3k|    int i;
 2546|  62.3k|    if (dec_row_mt_sync->mutex_ != NULL) {
  ------------------
  |  Branch (2546:9): [True: 27.5k, False: 34.7k]
  ------------------
 2547|  80.1k|      for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {
  ------------------
  |  Branch (2547:19): [True: 52.6k, False: 27.5k]
  ------------------
 2548|  52.6k|        pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]);
 2549|  52.6k|      }
 2550|  27.5k|      aom_free(dec_row_mt_sync->mutex_);
 2551|  27.5k|    }
 2552|  62.3k|    if (dec_row_mt_sync->cond_ != NULL) {
  ------------------
  |  Branch (2552:9): [True: 27.5k, False: 34.7k]
  ------------------
 2553|  80.1k|      for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {
  ------------------
  |  Branch (2553:19): [True: 52.6k, False: 27.5k]
  ------------------
 2554|  52.6k|        pthread_cond_destroy(&dec_row_mt_sync->cond_[i]);
 2555|  52.6k|      }
 2556|  27.5k|      aom_free(dec_row_mt_sync->cond_);
 2557|  27.5k|    }
 2558|  62.3k|#endif  // CONFIG_MULTITHREAD
 2559|  62.3k|    aom_free(dec_row_mt_sync->cur_sb_col);
 2560|       |
 2561|       |    // clear the structure as the source of this call may be a resize in which
 2562|       |    // case this call will be followed by an _alloc() which may fail.
 2563|  62.3k|    av1_zero(*dec_row_mt_sync);
  ------------------
  |  |   43|  62.3k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 2564|  62.3k|  }
 2565|  62.3k|}
av1_free_mc_tmp_buf:
 3378|   313k|void av1_free_mc_tmp_buf(ThreadData *thread_data) {
 3379|   313k|  int ref;
 3380|   939k|  for (ref = 0; ref < 2; ref++) {
  ------------------
  |  Branch (3380:17): [True: 626k, False: 313k]
  ------------------
 3381|   626k|    if (thread_data->mc_buf_use_highbd)
  ------------------
  |  Branch (3381:9): [True: 170k, False: 455k]
  ------------------
 3382|   170k|      aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
  ------------------
  |  |   75|   170k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
 3383|   455k|    else
 3384|   455k|      aom_free(thread_data->mc_buf[ref]);
 3385|   626k|    thread_data->mc_buf[ref] = NULL;
 3386|   626k|  }
 3387|   313k|  thread_data->mc_buf_size = 0;
 3388|   313k|  thread_data->mc_buf_use_highbd = 0;
 3389|       |
 3390|   313k|  aom_free(thread_data->tmp_conv_dst);
 3391|   313k|  thread_data->tmp_conv_dst = NULL;
 3392|   313k|  aom_free(thread_data->seg_mask);
 3393|   313k|  thread_data->seg_mask = NULL;
 3394|   939k|  for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (3394:19): [True: 626k, False: 313k]
  ------------------
 3395|   626k|    aom_free(thread_data->tmp_obmc_bufs[i]);
 3396|   626k|    thread_data->tmp_obmc_bufs[i] = NULL;
 3397|   626k|  }
 3398|   313k|}
av1_read_color_config:
 4086|  80.5k|                           struct aom_internal_error_info *error_info) {
 4087|  80.5k|  read_bitdepth(rb, seq_params, error_info);
 4088|       |
 4089|  80.5k|  seq_params->use_highbitdepth =
 4090|  80.5k|      seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
  ------------------
  |  Branch (4090:7): [True: 54.0k, False: 26.5k]
  |  Branch (4090:45): [True: 0, False: 26.5k]
  ------------------
 4091|       |  // monochrome bit (not needed for PROFILE_1)
 4092|  80.5k|  const int is_monochrome =
 4093|  80.5k|      seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (4093:7): [True: 68.2k, False: 12.3k]
  ------------------
 4094|  80.5k|  seq_params->monochrome = is_monochrome;
 4095|  80.5k|  int color_description_present_flag = aom_rb_read_bit(rb);
 4096|  80.5k|  if (color_description_present_flag) {
  ------------------
  |  Branch (4096:7): [True: 3.65k, False: 76.9k]
  ------------------
 4097|  3.65k|    seq_params->color_primaries = aom_rb_read_literal(rb, 8);
 4098|  3.65k|    seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8);
 4099|  3.65k|    seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8);
 4100|  76.9k|  } else {
 4101|  76.9k|    seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED;
 4102|  76.9k|    seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
 4103|  76.9k|    seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
 4104|  76.9k|  }
 4105|  80.5k|  if (is_monochrome) {
  ------------------
  |  Branch (4105:7): [True: 16.0k, False: 64.5k]
  ------------------
 4106|       |    // [16,235] (including xvycc) vs [0,255] range
 4107|  16.0k|    seq_params->color_range = aom_rb_read_bit(rb);
 4108|  16.0k|    seq_params->subsampling_y = seq_params->subsampling_x = 1;
 4109|  16.0k|    seq_params->chroma_sample_position = AOM_CSP_UNKNOWN;
 4110|  16.0k|    seq_params->separate_uv_delta_q = 0;
 4111|  16.0k|    return;
 4112|  16.0k|  }
 4113|  64.5k|  if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
  ------------------
  |  Branch (4113:7): [True: 42, False: 64.4k]
  ------------------
 4114|  64.5k|      seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
  ------------------
  |  Branch (4114:7): [True: 3, False: 39]
  ------------------
 4115|  64.5k|      seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
  ------------------
  |  Branch (4115:7): [True: 2, False: 1]
  ------------------
 4116|      2|    seq_params->subsampling_y = seq_params->subsampling_x = 0;
 4117|      2|    seq_params->color_range = 1;  // assume full color-range
 4118|      2|    if (!(seq_params->profile == PROFILE_1 ||
  ------------------
  |  Branch (4118:11): [True: 0, False: 2]
  ------------------
 4119|      2|          (seq_params->profile == PROFILE_2 &&
  ------------------
  |  Branch (4119:12): [True: 1, False: 1]
  ------------------
 4120|      2|           seq_params->bit_depth == AOM_BITS_12))) {
  ------------------
  |  Branch (4120:12): [True: 0, False: 1]
  ------------------
 4121|      2|      aom_internal_error(
 4122|      2|          error_info, AOM_CODEC_UNSUP_BITSTREAM,
 4123|      2|          "sRGB colorspace not compatible with specified profile");
 4124|      2|    }
 4125|  64.5k|  } else {
 4126|       |    // [16,235] (including xvycc) vs [0,255] range
 4127|  64.5k|    seq_params->color_range = aom_rb_read_bit(rb);
 4128|  64.5k|    if (seq_params->profile == PROFILE_0) {
  ------------------
  |  Branch (4128:9): [True: 27.4k, False: 37.0k]
  ------------------
 4129|       |      // 420 only
 4130|  27.4k|      seq_params->subsampling_x = seq_params->subsampling_y = 1;
 4131|  37.0k|    } else if (seq_params->profile == PROFILE_1) {
  ------------------
  |  Branch (4131:16): [True: 12.0k, False: 24.9k]
  ------------------
 4132|       |      // 444 only
 4133|  12.0k|      seq_params->subsampling_x = seq_params->subsampling_y = 0;
 4134|  24.9k|    } else {
 4135|  24.9k|      assert(seq_params->profile == PROFILE_2);
 4136|  23.5k|      if (seq_params->bit_depth == AOM_BITS_12) {
  ------------------
  |  Branch (4136:11): [True: 19.0k, False: 4.58k]
  ------------------
 4137|  19.0k|        seq_params->subsampling_x = aom_rb_read_bit(rb);
 4138|  19.0k|        if (seq_params->subsampling_x)
  ------------------
  |  Branch (4138:13): [True: 1.67k, False: 17.3k]
  ------------------
 4139|  1.67k|          seq_params->subsampling_y = aom_rb_read_bit(rb);  // 422 or 420
 4140|  17.3k|        else
 4141|  17.3k|          seq_params->subsampling_y = 0;  // 444
 4142|  19.0k|      } else {
 4143|       |        // 422
 4144|  4.58k|        seq_params->subsampling_x = 1;
 4145|  4.58k|        seq_params->subsampling_y = 0;
 4146|  4.58k|      }
 4147|  23.5k|    }
 4148|  63.1k|    if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
  ------------------
  |  Branch (4148:9): [True: 690, False: 62.4k]
  ------------------
 4149|  63.1k|        (seq_params->subsampling_x || seq_params->subsampling_y)) {
  ------------------
  |  Branch (4149:10): [True: 10, False: 680]
  |  Branch (4149:39): [True: 0, False: 680]
  ------------------
 4150|     10|      aom_internal_error(
 4151|     10|          error_info, AOM_CODEC_UNSUP_BITSTREAM,
 4152|     10|          "Identity CICP Matrix incompatible with non 4:4:4 color sampling");
 4153|     10|    }
 4154|  63.1k|    if (seq_params->subsampling_x && seq_params->subsampling_y) {
  ------------------
  |  Branch (4154:9): [True: 33.7k, False: 29.4k]
  |  Branch (4154:38): [True: 27.8k, False: 5.91k]
  ------------------
 4155|  27.8k|      seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2);
 4156|  27.8k|    }
 4157|  63.1k|  }
 4158|  63.1k|  seq_params->separate_uv_delta_q = aom_rb_read_bit(rb);
 4159|  63.1k|}
av1_read_timing_info_header:
 4163|  3.66k|                                 struct aom_read_bit_buffer *rb) {
 4164|  3.66k|  timing_info->num_units_in_display_tick =
 4165|  3.66k|      aom_rb_read_unsigned_literal(rb,
 4166|  3.66k|                                   32);  // Number of units in a display tick
 4167|  3.66k|  timing_info->time_scale = aom_rb_read_unsigned_literal(rb, 32);  // Time scale
 4168|  3.66k|  if (timing_info->num_units_in_display_tick == 0 ||
  ------------------
  |  Branch (4168:7): [True: 128, False: 3.53k]
  ------------------
 4169|  3.66k|      timing_info->time_scale == 0) {
  ------------------
  |  Branch (4169:7): [True: 195, False: 3.34k]
  ------------------
 4170|    196|    aom_internal_error(
 4171|    196|        error, AOM_CODEC_UNSUP_BITSTREAM,
 4172|    196|        "num_units_in_display_tick and time_scale must be greater than 0.");
 4173|    196|  }
 4174|  3.66k|  timing_info->equal_picture_interval =
 4175|  3.66k|      aom_rb_read_bit(rb);  // Equal picture interval bit
 4176|  3.66k|  if (timing_info->equal_picture_interval) {
  ------------------
  |  Branch (4176:7): [True: 2.15k, False: 1.51k]
  ------------------
 4177|  2.15k|    const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
 4178|  2.15k|    if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
  ------------------
  |  Branch (4178:9): [True: 28, False: 2.12k]
  ------------------
 4179|     28|      aom_internal_error(
 4180|     28|          error, AOM_CODEC_UNSUP_BITSTREAM,
 4181|     28|          "num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.");
 4182|     28|    }
 4183|  2.15k|    timing_info->num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1;
 4184|  2.15k|  }
 4185|  3.66k|}
av1_read_decoder_model_info:
 4188|  1.09k|                                 struct aom_read_bit_buffer *rb) {
 4189|  1.09k|  decoder_model_info->encoder_decoder_buffer_delay_length =
 4190|  1.09k|      aom_rb_read_literal(rb, 5) + 1;
 4191|  1.09k|  decoder_model_info->num_units_in_decoding_tick =
 4192|  1.09k|      aom_rb_read_unsigned_literal(rb,
 4193|  1.09k|                                   32);  // Number of units in a decoding tick
 4194|  1.09k|  decoder_model_info->buffer_removal_time_length =
 4195|  1.09k|      aom_rb_read_literal(rb, 5) + 1;
 4196|  1.09k|  decoder_model_info->frame_presentation_time_length =
 4197|  1.09k|      aom_rb_read_literal(rb, 5) + 1;
 4198|  1.09k|}
av1_read_op_parameters_info:
 4202|  2.06k|                                 struct aom_read_bit_buffer *rb) {
 4203|  2.06k|  op_params->decoder_buffer_delay =
 4204|  2.06k|      aom_rb_read_unsigned_literal(rb, buffer_delay_length);
 4205|  2.06k|  op_params->encoder_buffer_delay =
 4206|  2.06k|      aom_rb_read_unsigned_literal(rb, buffer_delay_length);
 4207|  2.06k|  op_params->low_delay_mode_flag = aom_rb_read_bit(rb);
 4208|  2.06k|}
av1_read_sequence_header:
 4217|  81.2k|                              SequenceHeader *seq_params) {
 4218|  81.2k|  const int num_bits_width = aom_rb_read_literal(rb, 4) + 1;
 4219|  81.2k|  const int num_bits_height = aom_rb_read_literal(rb, 4) + 1;
 4220|  81.2k|  const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1;
 4221|  81.2k|  const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1;
 4222|       |
 4223|  81.2k|  seq_params->num_bits_width = num_bits_width;
 4224|  81.2k|  seq_params->num_bits_height = num_bits_height;
 4225|  81.2k|  seq_params->max_frame_width = max_frame_width;
 4226|  81.2k|  seq_params->max_frame_height = max_frame_height;
 4227|       |
 4228|  81.2k|  if (seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (4228:7): [True: 17.6k, False: 63.6k]
  ------------------
 4229|  17.6k|    seq_params->frame_id_numbers_present_flag = 0;
 4230|  63.6k|  } else {
 4231|  63.6k|    seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb);
 4232|  63.6k|  }
 4233|  81.2k|  if (seq_params->frame_id_numbers_present_flag) {
  ------------------
  |  Branch (4233:7): [True: 17.9k, False: 63.3k]
  ------------------
 4234|       |    // We must always have delta_frame_id_length < frame_id_length,
 4235|       |    // in order for a frame to be referenced with a unique delta.
 4236|       |    // Avoid wasting bits by using a coding that enforces this restriction.
 4237|  17.9k|    seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2;
 4238|  17.9k|    seq_params->frame_id_length =
 4239|  17.9k|        aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1;
 4240|  17.9k|    if (seq_params->frame_id_length > 16)
  ------------------
  |  Branch (4240:9): [True: 452, False: 17.4k]
  ------------------
 4241|    452|      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 4242|    452|                         "Invalid frame_id_length");
 4243|  17.9k|  }
 4244|       |
 4245|  81.2k|  setup_sb_size(seq_params, rb);
 4246|       |
 4247|  81.2k|  seq_params->enable_filter_intra = aom_rb_read_bit(rb);
 4248|  81.2k|  seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb);
 4249|       |
 4250|  81.2k|  if (seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (4250:7): [True: 17.6k, False: 63.6k]
  ------------------
 4251|  17.6k|    seq_params->enable_interintra_compound = 0;
 4252|  17.6k|    seq_params->enable_masked_compound = 0;
 4253|  17.6k|    seq_params->enable_warped_motion = 0;
 4254|  17.6k|    seq_params->enable_dual_filter = 0;
 4255|  17.6k|    seq_params->order_hint_info.enable_order_hint = 0;
 4256|  17.6k|    seq_params->order_hint_info.enable_dist_wtd_comp = 0;
 4257|  17.6k|    seq_params->order_hint_info.enable_ref_frame_mvs = 0;
 4258|  17.6k|    seq_params->force_screen_content_tools = 2;  // SELECT_SCREEN_CONTENT_TOOLS
 4259|  17.6k|    seq_params->force_integer_mv = 2;            // SELECT_INTEGER_MV
 4260|  17.6k|    seq_params->order_hint_info.order_hint_bits_minus_1 = -1;
 4261|  63.6k|  } else {
 4262|  63.6k|    seq_params->enable_interintra_compound = aom_rb_read_bit(rb);
 4263|  63.6k|    seq_params->enable_masked_compound = aom_rb_read_bit(rb);
 4264|  63.6k|    seq_params->enable_warped_motion = aom_rb_read_bit(rb);
 4265|  63.6k|    seq_params->enable_dual_filter = aom_rb_read_bit(rb);
 4266|       |
 4267|  63.6k|    seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb);
 4268|  63.6k|    seq_params->order_hint_info.enable_dist_wtd_comp =
 4269|  63.6k|        seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (4269:9): [True: 49.5k, False: 14.0k]
  ------------------
 4270|  63.6k|    seq_params->order_hint_info.enable_ref_frame_mvs =
 4271|  63.6k|        seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (4271:9): [True: 49.5k, False: 14.0k]
  ------------------
 4272|       |
 4273|  63.6k|    if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (4273:9): [True: 40.2k, False: 23.3k]
  ------------------
 4274|  40.2k|      seq_params->force_screen_content_tools =
 4275|  40.2k|          2;  // SELECT_SCREEN_CONTENT_TOOLS
 4276|  40.2k|    } else {
 4277|  23.3k|      seq_params->force_screen_content_tools = aom_rb_read_bit(rb);
 4278|  23.3k|    }
 4279|       |
 4280|  63.6k|    if (seq_params->force_screen_content_tools > 0) {
  ------------------
  |  Branch (4280:9): [True: 58.2k, False: 5.36k]
  ------------------
 4281|  58.2k|      if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (4281:11): [True: 46.3k, False: 11.9k]
  ------------------
 4282|  46.3k|        seq_params->force_integer_mv = 2;  // SELECT_INTEGER_MV
 4283|  46.3k|      } else {
 4284|  11.9k|        seq_params->force_integer_mv = aom_rb_read_bit(rb);
 4285|  11.9k|      }
 4286|  58.2k|    } else {
 4287|  5.36k|      seq_params->force_integer_mv = 2;  // SELECT_INTEGER_MV
 4288|  5.36k|    }
 4289|  63.6k|    seq_params->order_hint_info.order_hint_bits_minus_1 =
 4290|  63.6k|        seq_params->order_hint_info.enable_order_hint
  ------------------
  |  Branch (4290:9): [True: 49.5k, False: 14.0k]
  ------------------
 4291|  63.6k|            ? aom_rb_read_literal(rb, 3)
 4292|  63.6k|            : -1;
 4293|  63.6k|  }
 4294|       |
 4295|  81.2k|  seq_params->enable_superres = aom_rb_read_bit(rb);
 4296|  81.2k|  seq_params->enable_cdef = aom_rb_read_bit(rb);
 4297|  81.2k|  seq_params->enable_restoration = aom_rb_read_bit(rb);
 4298|  81.2k|}
av1_init_read_bit_buffer:
 5150|   503k|    const uint8_t *data_end) {
 5151|   503k|  rb->bit_offset = 0;
 5152|   503k|  rb->error_handler = error_handler;
 5153|   503k|  rb->error_handler_data = &pbi->common;
 5154|   503k|  rb->bit_buffer = data;
 5155|   503k|  rb->bit_buffer_end = data_end;
 5156|   503k|  return rb;
 5157|   503k|}
av1_read_profile:
 5159|   207k|BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) {
 5160|   207k|  int profile = aom_rb_read_literal(rb, PROFILE_BITS);
  ------------------
  |  |   79|   207k|#define PROFILE_BITS 3
  ------------------
 5161|   207k|  return (BITSTREAM_PROFILE)profile;
 5162|   207k|}
av1_decode_frame_headers_and_setup:
 5176|   251k|                                            int trailing_bits_present) {
 5177|   251k|  AV1_COMMON *const cm = &pbi->common;
 5178|   251k|  const int num_planes = av1_num_planes(cm);
 5179|   251k|  MACROBLOCKD *const xd = &pbi->dcb.xd;
 5180|       |
 5181|       |#if CONFIG_BITSTREAM_DEBUG
 5182|       |  if (cm->seq_params->order_hint_info.enable_order_hint) {
 5183|       |    aom_bitstream_queue_set_frame_read(cm->current_frame.order_hint * 2 +
 5184|       |                                       cm->show_frame);
 5185|       |  } else {
 5186|       |    // This is currently used in RTC encoding. cm->show_frame is always 1.
 5187|       |    assert(cm->show_frame);
 5188|       |    aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number);
 5189|       |  }
 5190|       |#endif
 5191|       |#if CONFIG_MISMATCH_DEBUG
 5192|       |  mismatch_move_frame_idx_r();
 5193|       |#endif
 5194|       |
 5195|  2.01M|  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (5195:28): [True: 1.76M, False: 251k]
  ------------------
 5196|  1.76M|    cm->global_motion[i] = default_warp_params;
 5197|  1.76M|    cm->cur_frame->global_motion[i] = default_warp_params;
 5198|  1.76M|  }
 5199|   251k|  xd->global_motion = cm->global_motion;
 5200|       |
 5201|   251k|  read_uncompressed_header(pbi, rb);
 5202|       |
 5203|   251k|  if (trailing_bits_present) av1_check_trailing_bits(pbi, rb);
  ------------------
  |  Branch (5203:7): [True: 947, False: 251k]
  ------------------
 5204|       |
 5205|   251k|  if (!cm->tiles.single_tile_decoding &&
  ------------------
  |  Branch (5205:7): [True: 169k, False: 82.1k]
  ------------------
 5206|   251k|      (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) {
  ------------------
  |  Branch (5206:8): [True: 0, False: 169k]
  |  Branch (5206:34): [True: 0, False: 169k]
  ------------------
 5207|      0|    pbi->dec_tile_row = -1;
 5208|      0|    pbi->dec_tile_col = -1;
 5209|      0|  }
 5210|       |
 5211|   251k|  const uint32_t uncomp_hdr_size =
 5212|   251k|      (uint32_t)aom_rb_bytes_read(rb);  // Size of the uncompressed header
 5213|   251k|  YV12_BUFFER_CONFIG *new_fb = &cm->cur_frame->buf;
 5214|   251k|  xd->cur_buf = new_fb;
 5215|   251k|  if (av1_allow_intrabc(cm)) {
  ------------------
  |  Branch (5215:7): [True: 42.1k, False: 209k]
  ------------------
 5216|  42.1k|    av1_setup_scale_factors_for_frame(
 5217|  42.1k|        &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height,
 5218|  42.1k|        xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height);
 5219|  42.1k|  }
 5220|       |
 5221|       |  // Showing a frame directly.
 5222|   251k|  if (cm->show_existing_frame) {
  ------------------
  |  Branch (5222:7): [True: 807, False: 251k]
  ------------------
 5223|    807|    if (pbi->reset_decoder_state) {
  ------------------
  |  Branch (5223:9): [True: 176, False: 631]
  ------------------
 5224|       |      // Use the default frame context values.
 5225|    176|      *cm->fc = *cm->default_frame_context;
 5226|    176|      if (!cm->fc->initialized)
  ------------------
  |  Branch (5226:11): [True: 0, False: 176]
  ------------------
 5227|      0|        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5228|      0|                           "Uninitialized entropy context.");
 5229|    176|    }
 5230|    807|    return uncomp_hdr_size;
 5231|    807|  }
 5232|       |
 5233|   251k|  cm->mi_params.setup_mi(&cm->mi_params);
 5234|       |
 5235|   251k|  av1_calculate_ref_frame_side(cm);
 5236|   251k|  if (cm->features.allow_ref_frame_mvs) av1_setup_motion_field(cm);
  ------------------
  |  Branch (5236:7): [True: 15.9k, False: 235k]
  ------------------
 5237|       |
 5238|   251k|  av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
 5239|   251k|                         cm->seq_params->subsampling_y, num_planes);
 5240|   251k|  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
  ------------------
  |  |   66|   251k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (5240:7): [True: 140k, False: 111k]
  ------------------
 5241|       |    // use the default frame context values
 5242|   140k|    *cm->fc = *cm->default_frame_context;
 5243|   140k|  } else {
 5244|   111k|    *cm->fc = get_primary_ref_frame_buf(cm)->frame_context;
 5245|   111k|  }
 5246|   251k|  if (!cm->fc->initialized)
  ------------------
  |  Branch (5246:7): [True: 26, False: 251k]
  ------------------
 5247|     26|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5248|     26|                       "Uninitialized entropy context.");
 5249|       |
 5250|   251k|  pbi->dcb.corrupted = 0;
 5251|   251k|  return uncomp_hdr_size;
 5252|   251k|}
av1_decode_tg_tiles_and_wrapup:
 5278|   152k|                                    int end_tile, int initialize_flag) {
 5279|   152k|  AV1_COMMON *const cm = &pbi->common;
 5280|   152k|  CommonTileParams *const tiles = &cm->tiles;
 5281|   152k|  MACROBLOCKD *const xd = &pbi->dcb.xd;
 5282|   152k|  const int tile_count_tg = end_tile - start_tile + 1;
 5283|       |
 5284|   152k|  xd->error_info = cm->error;
 5285|   152k|  if (initialize_flag) setup_frame_info(pbi);
  ------------------
  |  Branch (5285:7): [True: 152k, False: 0]
  ------------------
 5286|   152k|  const int num_planes = av1_num_planes(cm);
 5287|       |
 5288|   152k|  if (pbi->max_threads > 1 && !(tiles->large_scale && !pbi->ext_tile_debug) &&
  ------------------
  |  Branch (5288:7): [True: 73.1k, False: 79.6k]
  |  Branch (5288:33): [True: 7.91k, False: 65.2k]
  |  Branch (5288:55): [True: 0, False: 7.91k]
  ------------------
 5289|   152k|      pbi->row_mt)
  ------------------
  |  Branch (5289:7): [True: 73.1k, False: 0]
  ------------------
 5290|  73.1k|    *p_data_end =
 5291|  73.1k|        decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile);
 5292|  79.6k|  else if (pbi->max_threads > 1 && tile_count_tg > 1 &&
  ------------------
  |  Branch (5292:12): [True: 0, False: 79.6k]
  |  Branch (5292:36): [True: 0, False: 0]
  ------------------
 5293|  79.6k|           !(tiles->large_scale && !pbi->ext_tile_debug))
  ------------------
  |  Branch (5293:14): [True: 0, False: 0]
  |  Branch (5293:36): [True: 0, False: 0]
  ------------------
 5294|      0|    *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile);
 5295|  79.6k|  else
 5296|  79.6k|    *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
 5297|       |
 5298|       |  // If the bit stream is monochrome, set the U and V buffers to a constant.
 5299|   152k|  if (num_planes < 3) {
  ------------------
  |  Branch (5299:7): [True: 10.1k, False: 142k]
  ------------------
 5300|  10.1k|    set_planes_to_neutral_grey(cm->seq_params, xd->cur_buf, 1);
 5301|  10.1k|  }
 5302|       |
 5303|   152k|  if (end_tile != tiles->rows * tiles->cols - 1) {
  ------------------
  |  Branch (5303:7): [True: 5, False: 152k]
  ------------------
 5304|      5|    return;
 5305|      5|  }
 5306|       |
 5307|   152k|  av1_alloc_cdef_buffers(cm, &pbi->cdef_worker, &pbi->cdef_sync,
 5308|   152k|                         pbi->num_workers, 1);
 5309|   152k|  av1_alloc_cdef_sync(cm, &pbi->cdef_sync, pbi->num_workers);
 5310|       |
 5311|   152k|  if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) {
  ------------------
  |  Branch (5311:7): [True: 79.0k, False: 73.7k]
  |  Branch (5311:38): [True: 77.7k, False: 1.26k]
  ------------------
 5312|  77.7k|    if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
  ------------------
  |  Branch (5312:9): [True: 23.6k, False: 54.0k]
  |  Branch (5312:35): [True: 6.32k, False: 47.7k]
  ------------------
 5313|  30.0k|      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &pbi->dcb.xd, 0,
 5314|  30.0k|                               num_planes, 0, pbi->tile_workers,
 5315|  30.0k|                               pbi->num_workers, &pbi->lf_row_sync, 0);
 5316|  30.0k|    }
 5317|       |
 5318|  77.7k|    const int do_cdef =
 5319|  77.7k|        !pbi->skip_loop_filter && !cm->features.coded_lossless &&
  ------------------
  |  Branch (5319:9): [True: 77.7k, False: 0]
  |  Branch (5319:35): [True: 59.6k, False: 18.1k]
  ------------------
 5320|  77.7k|        (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] ||
  ------------------
  |  Branch (5320:10): [True: 11.5k, False: 48.0k]
  |  Branch (5320:37): [True: 10.3k, False: 37.7k]
  ------------------
 5321|  59.6k|         cm->cdef_info.cdef_uv_strengths[0]);
  ------------------
  |  Branch (5321:10): [True: 3.44k, False: 34.3k]
  ------------------
 5322|  77.7k|    const int do_superres = av1_superres_scaled(cm);
 5323|  77.7k|    const int optimized_loop_restoration = !do_cdef && !do_superres;
  ------------------
  |  Branch (5323:44): [True: 52.4k, False: 25.3k]
  |  Branch (5323:56): [True: 46.9k, False: 5.52k]
  ------------------
 5324|  77.7k|    const int do_loop_restoration =
 5325|  77.7k|        cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
  ------------------
  |  Branch (5325:9): [True: 20.0k, False: 57.6k]
  ------------------
 5326|  77.7k|        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
  ------------------
  |  Branch (5326:9): [True: 1.70k, False: 55.9k]
  ------------------
 5327|  77.7k|        cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
  ------------------
  |  Branch (5327:9): [True: 298, False: 55.6k]
  ------------------
 5328|       |    // Frame border extension is not required in the decoder
 5329|       |    // as it happens in extend_mc_border().
 5330|  77.7k|    int do_extend_border_mt = 0;
 5331|  77.7k|    if (!optimized_loop_restoration) {
  ------------------
  |  Branch (5331:9): [True: 30.8k, False: 46.9k]
  ------------------
 5332|  30.8k|      if (do_loop_restoration)
  ------------------
  |  Branch (5332:11): [True: 19.4k, False: 11.4k]
  ------------------
 5333|  19.4k|        av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf,
 5334|  19.4k|                                                 cm, 0);
 5335|       |
 5336|  30.8k|      if (do_cdef) {
  ------------------
  |  Branch (5336:11): [True: 25.3k, False: 5.52k]
  ------------------
 5337|  25.3k|        if (pbi->num_workers > 1) {
  ------------------
  |  Branch (5337:13): [True: 19.3k, False: 6.00k]
  ------------------
 5338|  19.3k|          av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker,
 5339|  19.3k|                            pbi->tile_workers, &pbi->cdef_sync,
 5340|  19.3k|                            pbi->num_workers, av1_cdef_init_fb_row_mt,
 5341|  19.3k|                            do_extend_border_mt);
 5342|  19.3k|        } else {
 5343|  6.00k|          av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd,
 5344|  6.00k|                         av1_cdef_init_fb_row);
 5345|  6.00k|        }
 5346|  25.3k|      }
 5347|       |
 5348|  30.8k|      superres_post_decode(pbi);
 5349|       |
 5350|  30.8k|      if (do_loop_restoration) {
  ------------------
  |  Branch (5350:11): [True: 19.4k, False: 11.4k]
  ------------------
 5351|  19.4k|        av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf,
 5352|  19.4k|                                                 cm, 1);
 5353|  19.4k|        if (pbi->num_workers > 1) {
  ------------------
  |  Branch (5353:13): [True: 14.7k, False: 4.63k]
  ------------------
 5354|  14.7k|          av1_loop_restoration_filter_frame_mt(
 5355|  14.7k|              (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
 5356|  14.7k|              pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
 5357|  14.7k|              &pbi->lr_ctxt, do_extend_border_mt);
 5358|  14.7k|        } else {
 5359|  4.63k|          av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
 5360|  4.63k|                                            cm, optimized_loop_restoration,
 5361|  4.63k|                                            &pbi->lr_ctxt);
 5362|  4.63k|        }
 5363|  19.4k|      }
 5364|  46.9k|    } else {
 5365|       |      // In no cdef and no superres case. Provide an optimized version of
 5366|       |      // loop_restoration_filter.
 5367|  46.9k|      if (do_loop_restoration) {
  ------------------
  |  Branch (5367:11): [True: 2.68k, False: 44.2k]
  ------------------
 5368|  2.68k|        if (pbi->num_workers > 1) {
  ------------------
  |  Branch (5368:13): [True: 910, False: 1.77k]
  ------------------
 5369|    910|          av1_loop_restoration_filter_frame_mt(
 5370|    910|              (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
 5371|    910|              pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
 5372|    910|              &pbi->lr_ctxt, do_extend_border_mt);
 5373|  1.77k|        } else {
 5374|  1.77k|          av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
 5375|  1.77k|                                            cm, optimized_loop_restoration,
 5376|  1.77k|                                            &pbi->lr_ctxt);
 5377|  1.77k|        }
 5378|  2.68k|      }
 5379|  46.9k|    }
 5380|  77.7k|  }
 5381|       |
 5382|   152k|  if (!pbi->dcb.corrupted) {
  ------------------
  |  Branch (5382:7): [True: 91.1k, False: 61.6k]
  ------------------
 5383|  91.1k|    if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
  ------------------
  |  Branch (5383:9): [True: 27.5k, False: 63.5k]
  ------------------
 5384|  27.5k|      assert(pbi->context_update_tile_id < pbi->allocated_tiles);
 5385|  27.5k|      *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx;
 5386|  27.5k|      av1_reset_cdf_symbol_counters(cm->fc);
 5387|  27.5k|    }
 5388|  91.1k|  } else {
 5389|  61.6k|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5390|  61.6k|                       "Decode failed. Frame data is corrupted.");
 5391|  61.6k|  }
 5392|       |
 5393|       |#if CONFIG_INSPECTION
 5394|       |  if (pbi->inspect_cb != NULL) {
 5395|       |    (*pbi->inspect_cb)(pbi, pbi->inspect_ctx);
 5396|       |  }
 5397|       |#endif
 5398|       |
 5399|       |  // Non frame parallel update frame context here.
 5400|   152k|  if (!tiles->large_scale) {
  ------------------
  |  Branch (5400:7): [True: 83.0k, False: 69.7k]
  ------------------
 5401|  83.0k|    cm->cur_frame->frame_context = *cm->fc;
 5402|  83.0k|  }
 5403|       |
 5404|   152k|  if (cm->show_frame && !cm->seq_params->order_hint_info.enable_order_hint) {
  ------------------
  |  Branch (5404:7): [True: 67.5k, False: 85.2k]
  |  Branch (5404:25): [True: 9.72k, False: 57.8k]
  ------------------
 5405|  9.72k|    ++cm->current_frame.frame_number;
 5406|  9.72k|  }
 5407|   152k|}
decodeframe.c:read_bitdepth:
 3888|  80.5k|                                 struct aom_internal_error_info *error_info) {
 3889|  80.5k|  const int high_bitdepth = aom_rb_read_bit(rb);
 3890|  80.5k|  if (seq_params->profile == PROFILE_2 && high_bitdepth) {
  ------------------
  |  Branch (3890:7): [True: 35.5k, False: 45.0k]
  |  Branch (3890:43): [True: 30.7k, False: 4.78k]
  ------------------
 3891|  30.7k|    const int twelve_bit = aom_rb_read_bit(rb);
 3892|  30.7k|    seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10;
  ------------------
  |  Branch (3892:29): [True: 29.7k, False: 1.05k]
  ------------------
 3893|  49.8k|  } else if (seq_params->profile <= PROFILE_2) {
  ------------------
  |  Branch (3893:14): [True: 49.5k, False: 218]
  ------------------
 3894|  49.5k|    seq_params->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8;
  ------------------
  |  Branch (3894:29): [True: 23.0k, False: 26.5k]
  ------------------
 3895|  49.5k|  } else {
 3896|    218|    aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
 3897|    218|                       "Unsupported profile/bit-depth combination");
 3898|    218|  }
 3899|       |#if !CONFIG_AV1_HIGHBITDEPTH
 3900|       |  if (seq_params->bit_depth > AOM_BITS_8) {
 3901|       |    aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
 3902|       |                       "Bit-depth %d not supported", seq_params->bit_depth);
 3903|       |  }
 3904|       |#endif
 3905|  80.5k|}
decodeframe.c:setup_sb_size:
 2005|  80.6k|                                 struct aom_read_bit_buffer *rb) {
 2006|  80.6k|  set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
  ------------------
  |  Branch (2006:27): [True: 69.3k, False: 11.2k]
  ------------------
 2007|  80.6k|}
decodeframe.c:error_handler:
 3877|  14.6k|static inline void error_handler(void *data) {
 3878|  14.6k|  AV1_COMMON *const cm = (AV1_COMMON *)data;
 3879|  14.6k|  aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
 3880|  14.6k|}
decodeframe.c:read_uncompressed_header:
 4487|   251k|                                    struct aom_read_bit_buffer *rb) {
 4488|   251k|  AV1_COMMON *const cm = &pbi->common;
 4489|   251k|  const SequenceHeader *const seq_params = cm->seq_params;
 4490|   251k|  CurrentFrame *const current_frame = &cm->current_frame;
 4491|   251k|  FeatureFlags *const features = &cm->features;
 4492|   251k|  MACROBLOCKD *const xd = &pbi->dcb.xd;
 4493|   251k|  BufferPool *const pool = cm->buffer_pool;
 4494|   251k|  RefCntBuffer *const frame_bufs = pool->frame_bufs;
 4495|   251k|  aom_s_frame_info *sframe_info = &pbi->sframe_info;
 4496|   251k|  sframe_info->is_s_frame = 0;
 4497|   251k|  sframe_info->is_s_frame_at_altref = 0;
 4498|       |
 4499|   251k|  if (!pbi->sequence_header_ready) {
  ------------------
  |  Branch (4499:7): [True: 817, False: 251k]
  ------------------
 4500|    817|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4501|    817|                       "No sequence header");
 4502|    817|  }
 4503|       |
 4504|   251k|  if (seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (4504:7): [True: 37.3k, False: 214k]
  ------------------
 4505|  37.3k|    cm->show_existing_frame = 0;
 4506|  37.3k|    cm->show_frame = 1;
 4507|  37.3k|    current_frame->frame_type = KEY_FRAME;
 4508|  37.3k|    if (pbi->sequence_header_changed) {
  ------------------
  |  Branch (4508:9): [True: 6.01k, False: 31.3k]
  ------------------
 4509|       |      // This is the start of a new coded video sequence.
 4510|  6.01k|      pbi->sequence_header_changed = 0;
 4511|  6.01k|      pbi->decoding_first_frame = 1;
 4512|  6.01k|      reset_frame_buffers(cm);
 4513|  6.01k|    }
 4514|  37.3k|    features->error_resilient_mode = 1;
 4515|   214k|  } else {
 4516|   214k|    cm->show_existing_frame = aom_rb_read_bit(rb);
 4517|   214k|    pbi->reset_decoder_state = 0;
 4518|       |
 4519|   214k|    if (cm->show_existing_frame) {
  ------------------
  |  Branch (4519:9): [True: 11.7k, False: 202k]
  ------------------
 4520|  11.7k|      if (pbi->sequence_header_changed) {
  ------------------
  |  Branch (4520:11): [True: 563, False: 11.1k]
  ------------------
 4521|    563|        aom_internal_error(
 4522|    563|            &pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4523|    563|            "New sequence header starts with a show_existing_frame.");
 4524|    563|      }
 4525|       |      // Show an existing frame directly.
 4526|  11.7k|      const int existing_frame_idx = aom_rb_read_literal(rb, 3);
 4527|  11.7k|      RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx];
 4528|  11.7k|      if (frame_to_show == NULL) {
  ------------------
  |  Branch (4528:11): [True: 6.09k, False: 5.64k]
  ------------------
 4529|  6.09k|        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
 4530|  6.09k|                           "Buffer does not contain a decoded frame");
 4531|  6.09k|      }
 4532|  11.7k|      if (seq_params->decoder_model_info_present_flag &&
  ------------------
  |  Branch (4532:11): [True: 216, False: 11.5k]
  ------------------
 4533|  11.7k|          seq_params->timing_info.equal_picture_interval == 0) {
  ------------------
  |  Branch (4533:11): [True: 111, False: 105]
  ------------------
 4534|    111|        read_temporal_point_info(cm, rb);
 4535|    111|      }
 4536|  11.7k|      if (seq_params->frame_id_numbers_present_flag) {
  ------------------
  |  Branch (4536:11): [True: 4.23k, False: 7.49k]
  ------------------
 4537|  4.23k|        int frame_id_length = seq_params->frame_id_length;
 4538|  4.23k|        int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
 4539|       |        /* Compare display_frame_id with ref_frame_id and check valid for
 4540|       |         * referencing */
 4541|  4.23k|        if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
  ------------------
  |  Branch (4541:13): [True: 3.37k, False: 855]
  ------------------
 4542|  4.23k|            pbi->valid_for_referencing[existing_frame_idx] == 0)
  ------------------
  |  Branch (4542:13): [True: 100, False: 755]
  ------------------
 4543|  3.47k|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4544|  3.47k|                             "Reference buffer frame ID mismatch");
 4545|  4.23k|      }
 4546|  11.7k|      lock_buffer_pool(pool);
 4547|  11.7k|      assert(frame_to_show->ref_count > 0);
 4548|       |      // cm->cur_frame should be the buffer referenced by the return value
 4549|       |      // of the get_free_fb() call in assign_cur_frame_new_fb() (called by
 4550|       |      // av1_receive_compressed_data()), so the ref_count should be 1.
 4551|  1.60k|      assert(cm->cur_frame->ref_count == 1);
 4552|       |      // assign_frame_buffer_p() decrements ref_count directly rather than
 4553|       |      // call decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has
 4554|       |      // already been allocated, it will not be released by
 4555|       |      // assign_frame_buffer_p()!
 4556|  1.60k|      assert(!cm->cur_frame->raw_frame_buffer.data);
 4557|  1.60k|      assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
 4558|  1.60k|      pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME;
 4559|  1.60k|      unlock_buffer_pool(pool);
 4560|       |
 4561|  1.60k|      cm->lf.filter_level[0] = 0;
 4562|  1.60k|      cm->lf.filter_level[1] = 0;
 4563|  1.60k|      cm->show_frame = 1;
 4564|  1.60k|      current_frame->order_hint = frame_to_show->order_hint;
 4565|       |
 4566|       |      // Section 6.8.2: It is a requirement of bitstream conformance that when
 4567|       |      // show_existing_frame is used to show a previous frame, that the value
 4568|       |      // of showable_frame for the previous frame was equal to 1.
 4569|  1.60k|      if (!frame_to_show->showable_frame) {
  ------------------
  |  Branch (4569:11): [True: 793, False: 807]
  ------------------
 4570|    793|        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
 4571|    793|                           "Buffer does not contain a showable frame");
 4572|    793|      }
 4573|       |      // Section 6.8.2: It is a requirement of bitstream conformance that when
 4574|       |      // show_existing_frame is used to show a previous frame with
 4575|       |      // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that the
 4576|       |      // frame is output via the show_existing_frame mechanism at most once.
 4577|  1.60k|      if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0;
  ------------------
  |  Branch (4577:11): [True: 176, False: 1.42k]
  ------------------
 4578|       |
 4579|  1.60k|      cm->film_grain_params = frame_to_show->film_grain_params;
 4580|       |
 4581|  1.60k|      if (pbi->reset_decoder_state) {
  ------------------
  |  Branch (4581:11): [True: 176, False: 1.42k]
  ------------------
 4582|    176|        show_existing_frame_reset(pbi, existing_frame_idx);
 4583|  1.42k|      } else {
 4584|  1.42k|        current_frame->refresh_frame_flags = 0;
 4585|  1.42k|      }
 4586|       |
 4587|  1.60k|      return 0;
 4588|  1.60k|    }
 4589|       |
 4590|   202k|    current_frame->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2);
 4591|   202k|    if (pbi->sequence_header_changed) {
  ------------------
  |  Branch (4591:9): [True: 24.3k, False: 178k]
  ------------------
 4592|  24.3k|      if (current_frame->frame_type == KEY_FRAME) {
  ------------------
  |  Branch (4592:11): [True: 21.5k, False: 2.74k]
  ------------------
 4593|       |        // This is the start of a new coded video sequence.
 4594|  21.5k|        pbi->sequence_header_changed = 0;
 4595|  21.5k|        pbi->decoding_first_frame = 1;
 4596|  21.5k|        reset_frame_buffers(cm);
 4597|  21.5k|      } else {
 4598|  2.74k|        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4599|  2.74k|                           "Sequence header has changed without a keyframe.");
 4600|  2.74k|      }
 4601|  24.3k|    }
 4602|       |
 4603|   202k|    cm->show_frame = aom_rb_read_bit(rb);
 4604|   202k|    if (cm->show_frame == 0) pbi->is_arf_frame_present = 1;
  ------------------
  |  Branch (4604:9): [True: 69.4k, False: 133k]
  ------------------
 4605|   202k|    if (cm->show_frame == 0 && cm->current_frame.frame_type == KEY_FRAME)
  ------------------
  |  Branch (4605:9): [True: 69.4k, False: 133k]
  |  Branch (4605:32): [True: 48.6k, False: 20.8k]
  ------------------
 4606|  48.6k|      pbi->is_fwd_kf_present = 1;
 4607|   202k|    if (cm->current_frame.frame_type == S_FRAME) {
  ------------------
  |  Branch (4607:9): [True: 167, False: 202k]
  ------------------
 4608|    167|      sframe_info->is_s_frame = 1;
 4609|    167|      sframe_info->is_s_frame_at_altref = cm->show_frame ? 0 : 1;
  ------------------
  |  Branch (4609:43): [True: 84, False: 83]
  ------------------
 4610|    167|    }
 4611|   202k|    if (seq_params->still_picture &&
  ------------------
  |  Branch (4611:9): [True: 3.03k, False: 199k]
  ------------------
 4612|   202k|        (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) {
  ------------------
  |  Branch (4612:10): [True: 103, False: 2.93k]
  |  Branch (4612:52): [True: 217, False: 2.71k]
  ------------------
 4613|    320|      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4614|    320|                         "Still pictures must be coded as shown keyframes");
 4615|    320|    }
 4616|   202k|    cm->showable_frame = current_frame->frame_type != KEY_FRAME;
 4617|   202k|    if (cm->show_frame) {
  ------------------
  |  Branch (4617:9): [True: 129k, False: 73.1k]
  ------------------
 4618|   129k|      if (seq_params->decoder_model_info_present_flag &&
  ------------------
  |  Branch (4618:11): [True: 20.8k, False: 108k]
  ------------------
 4619|   129k|          seq_params->timing_info.equal_picture_interval == 0)
  ------------------
  |  Branch (4619:11): [True: 2.07k, False: 18.7k]
  ------------------
 4620|  2.07k|        read_temporal_point_info(cm, rb);
 4621|   129k|    } else {
 4622|       |      // See if this frame can be used as show_existing_frame in future
 4623|  73.1k|      cm->showable_frame = aom_rb_read_bit(rb);
 4624|  73.1k|    }
 4625|   202k|    cm->cur_frame->showable_frame = cm->showable_frame;
 4626|   202k|    features->error_resilient_mode =
 4627|   202k|        frame_is_sframe(cm) ||
  ------------------
  |  Branch (4627:9): [True: 4.10k, False: 198k]
  ------------------
 4628|   202k|                (current_frame->frame_type == KEY_FRAME && cm->show_frame)
  ------------------
  |  Branch (4628:18): [True: 76.5k, False: 122k]
  |  Branch (4628:60): [True: 28.1k, False: 48.4k]
  ------------------
 4629|   202k|            ? 1
 4630|   202k|            : aom_rb_read_bit(rb);
 4631|   202k|  }
 4632|       |
 4633|   240k|  if (current_frame->frame_type == KEY_FRAME && cm->show_frame) {
  ------------------
  |  Branch (4633:7): [True: 113k, False: 126k]
  |  Branch (4633:49): [True: 65.4k, False: 48.4k]
  ------------------
 4634|       |    /* All frames need to be marked as not valid for referencing */
 4635|   589k|    for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (4635:21): [True: 523k, False: 65.4k]
  ------------------
 4636|   523k|      pbi->valid_for_referencing[i] = 0;
 4637|   523k|    }
 4638|  65.4k|  }
 4639|   240k|  features->disable_cdf_update = aom_rb_read_bit(rb);
 4640|   240k|  if (seq_params->force_screen_content_tools == 2) {
  ------------------
  |  Branch (4640:7): [True: 176k, False: 64.0k]
  ------------------
 4641|   176k|    features->allow_screen_content_tools = aom_rb_read_bit(rb);
 4642|   176k|  } else {
 4643|  64.0k|    features->allow_screen_content_tools =
 4644|  64.0k|        seq_params->force_screen_content_tools;
 4645|  64.0k|  }
 4646|       |
 4647|   240k|  if (features->allow_screen_content_tools) {
  ------------------
  |  Branch (4647:7): [True: 119k, False: 120k]
  ------------------
 4648|   119k|    if (seq_params->force_integer_mv == 2) {
  ------------------
  |  Branch (4648:9): [True: 87.4k, False: 31.8k]
  ------------------
 4649|  87.4k|      features->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
 4650|  87.4k|    } else {
 4651|  31.8k|      features->cur_frame_force_integer_mv = seq_params->force_integer_mv;
 4652|  31.8k|    }
 4653|   120k|  } else {
 4654|   120k|    features->cur_frame_force_integer_mv = 0;
 4655|   120k|  }
 4656|       |
 4657|   240k|  int frame_size_override_flag = 0;
 4658|   240k|  features->allow_intrabc = 0;
 4659|   240k|  features->primary_ref_frame = PRIMARY_REF_NONE;
  ------------------
  |  |   66|   240k|#define PRIMARY_REF_NONE 7
  ------------------
 4660|       |
 4661|   240k|  if (!seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (4661:7): [True: 198k, False: 41.2k]
  ------------------
 4662|   198k|    if (seq_params->frame_id_numbers_present_flag) {
  ------------------
  |  Branch (4662:9): [True: 34.9k, False: 164k]
  ------------------
 4663|  34.9k|      int frame_id_length = seq_params->frame_id_length;
 4664|  34.9k|      int diff_len = seq_params->delta_frame_id_length;
 4665|  34.9k|      int prev_frame_id = 0;
 4666|  34.9k|      int have_prev_frame_id =
 4667|  34.9k|          !pbi->decoding_first_frame &&
  ------------------
  |  Branch (4667:11): [True: 9.12k, False: 25.7k]
  ------------------
 4668|  34.9k|          !(current_frame->frame_type == KEY_FRAME && cm->show_frame);
  ------------------
  |  Branch (4668:13): [True: 2.79k, False: 6.32k]
  |  Branch (4668:55): [True: 2.06k, False: 725]
  ------------------
 4669|  34.9k|      if (have_prev_frame_id) {
  ------------------
  |  Branch (4669:11): [True: 7.05k, False: 27.8k]
  ------------------
 4670|  7.05k|        prev_frame_id = cm->current_frame_id;
 4671|  7.05k|      }
 4672|  34.9k|      cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length);
 4673|       |
 4674|  34.9k|      if (have_prev_frame_id) {
  ------------------
  |  Branch (4674:11): [True: 7.05k, False: 27.8k]
  ------------------
 4675|  7.05k|        int diff_frame_id;
 4676|  7.05k|        if (cm->current_frame_id > prev_frame_id) {
  ------------------
  |  Branch (4676:13): [True: 2.67k, False: 4.37k]
  ------------------
 4677|  2.67k|          diff_frame_id = cm->current_frame_id - prev_frame_id;
 4678|  4.37k|        } else {
 4679|  4.37k|          diff_frame_id =
 4680|  4.37k|              (1 << frame_id_length) + cm->current_frame_id - prev_frame_id;
 4681|  4.37k|        }
 4682|       |        /* Check current_frame_id for conformance */
 4683|  7.05k|        if (prev_frame_id == cm->current_frame_id ||
  ------------------
  |  Branch (4683:13): [True: 2.03k, False: 5.01k]
  ------------------
 4684|  7.05k|            diff_frame_id >= (1 << (frame_id_length - 1))) {
  ------------------
  |  Branch (4684:13): [True: 2.26k, False: 2.74k]
  ------------------
 4685|  4.30k|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4686|  4.30k|                             "Invalid value of current_frame_id");
 4687|  4.30k|        }
 4688|  7.05k|      }
 4689|       |      /* Check if some frames need to be marked as not valid for referencing */
 4690|   279k|      for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (4690:23): [True: 244k, False: 34.9k]
  ------------------
 4691|   244k|        if (cm->current_frame_id - (1 << diff_len) > 0) {
  ------------------
  |  Branch (4691:13): [True: 123k, False: 121k]
  ------------------
 4692|   123k|          if (cm->ref_frame_id[i] > cm->current_frame_id ||
  ------------------
  |  Branch (4692:15): [True: 30.1k, False: 93.0k]
  ------------------
 4693|   123k|              cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len))
  ------------------
  |  Branch (4693:15): [True: 53.5k, False: 39.5k]
  ------------------
 4694|  83.7k|            pbi->valid_for_referencing[i] = 0;
 4695|   123k|        } else {
 4696|   121k|          if (cm->ref_frame_id[i] > cm->current_frame_id &&
  ------------------
  |  Branch (4696:15): [True: 50.5k, False: 70.9k]
  ------------------
 4697|   121k|              cm->ref_frame_id[i] < (1 << frame_id_length) +
  ------------------
  |  Branch (4697:15): [True: 40.9k, False: 9.58k]
  ------------------
 4698|  50.5k|                                        cm->current_frame_id - (1 << diff_len))
 4699|  40.9k|            pbi->valid_for_referencing[i] = 0;
 4700|   121k|        }
 4701|   244k|      }
 4702|  34.9k|    }
 4703|       |
 4704|   198k|    frame_size_override_flag = frame_is_sframe(cm) ? 1 : aom_rb_read_bit(rb);
  ------------------
  |  Branch (4704:32): [True: 139, False: 198k]
  ------------------
 4705|       |
 4706|   198k|    current_frame->order_hint = aom_rb_read_literal(
 4707|   198k|        rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
 4708|       |
 4709|   198k|    if (seq_params->order_hint_info.enable_order_hint)
  ------------------
  |  Branch (4709:9): [True: 171k, False: 27.2k]
  ------------------
 4710|   171k|      current_frame->frame_number = current_frame->order_hint;
 4711|       |
 4712|   198k|    if (!features->error_resilient_mode && !frame_is_intra_only(cm)) {
  ------------------
  |  Branch (4712:9): [True: 121k, False: 77.1k]
  |  Branch (4712:44): [True: 72.7k, False: 49.0k]
  ------------------
 4713|  72.7k|      features->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS);
  ------------------
  |  |   65|  72.7k|#define PRIMARY_REF_BITS 3
  ------------------
 4714|  72.7k|    }
 4715|   198k|  }
 4716|       |
 4717|   240k|  if (seq_params->decoder_model_info_present_flag) {
  ------------------
  |  Branch (4717:7): [True: 19.9k, False: 220k]
  ------------------
 4718|  19.9k|    pbi->buffer_removal_time_present = aom_rb_read_bit(rb);
 4719|  19.9k|    if (pbi->buffer_removal_time_present) {
  ------------------
  |  Branch (4719:9): [True: 4.05k, False: 15.8k]
  ------------------
 4720|  4.05k|      for (int op_num = 0;
 4721|  21.3k|           op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
  ------------------
  |  Branch (4721:12): [True: 17.2k, False: 4.05k]
  ------------------
 4722|  17.2k|        if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
  ------------------
  |  Branch (4722:13): [True: 7.03k, False: 10.2k]
  ------------------
 4723|  7.03k|          if (seq_params->operating_point_idc[op_num] == 0 ||
  ------------------
  |  Branch (4723:15): [True: 179, False: 6.85k]
  ------------------
 4724|  7.03k|              (((seq_params->operating_point_idc[op_num] >>
  ------------------
  |  Branch (4724:16): [True: 3.65k, False: 3.20k]
  ------------------
 4725|  6.85k|                 cm->temporal_layer_id) &
 4726|  6.85k|                0x1) &&
 4727|  6.85k|               ((seq_params->operating_point_idc[op_num] >>
  ------------------
  |  Branch (4727:16): [True: 2.25k, False: 1.39k]
  ------------------
 4728|  3.65k|                 (cm->spatial_layer_id + 8)) &
 4729|  3.65k|                0x1))) {
 4730|  2.43k|            cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal(
 4731|  2.43k|                rb, seq_params->decoder_model_info.buffer_removal_time_length);
 4732|  4.59k|          } else {
 4733|  4.59k|            cm->buffer_removal_times[op_num] = 0;
 4734|  4.59k|          }
 4735|  10.2k|        } else {
 4736|  10.2k|          cm->buffer_removal_times[op_num] = 0;
 4737|  10.2k|        }
 4738|  17.2k|      }
 4739|  4.05k|    }
 4740|  19.9k|  }
 4741|   240k|  if (current_frame->frame_type == KEY_FRAME) {
  ------------------
  |  Branch (4741:7): [True: 113k, False: 127k]
  ------------------
 4742|   113k|    if (!cm->show_frame) {  // unshown keyframe (forward keyframe)
  ------------------
  |  Branch (4742:9): [True: 47.6k, False: 65.4k]
  ------------------
 4743|  47.6k|      current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
 4744|  65.4k|    } else {  // shown keyframe
 4745|  65.4k|      current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1;
 4746|  65.4k|    }
 4747|       |
 4748|   904k|    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (4748:21): [True: 791k, False: 113k]
  ------------------
 4749|   791k|      cm->remapped_ref_idx[i] = INVALID_IDX;
  ------------------
  |  |   15|   791k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 4750|   791k|    }
 4751|   113k|    if (pbi->need_resync) {
  ------------------
  |  Branch (4751:9): [True: 85.3k, False: 27.7k]
  ------------------
 4752|  85.3k|      reset_ref_frame_map(cm);
 4753|  85.3k|      pbi->need_resync = 0;
 4754|  85.3k|    }
 4755|   127k|  } else {
 4756|   127k|    if (current_frame->frame_type == INTRA_ONLY_FRAME) {
  ------------------
  |  Branch (4756:9): [True: 35.8k, False: 91.3k]
  ------------------
 4757|  35.8k|      current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
 4758|  35.8k|      if (current_frame->refresh_frame_flags == 0xFF) {
  ------------------
  |  Branch (4758:11): [True: 815, False: 35.0k]
  ------------------
 4759|    815|        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
 4760|    815|                           "Intra only frames cannot have refresh flags 0xFF");
 4761|    815|      }
 4762|  35.8k|      if (pbi->need_resync) {
  ------------------
  |  Branch (4762:11): [True: 27.8k, False: 7.95k]
  ------------------
 4763|  27.8k|        reset_ref_frame_map(cm);
 4764|  27.8k|        pbi->need_resync = 0;
 4765|  27.8k|      }
 4766|  91.3k|    } else if (pbi->need_resync != 1) { /* Skip if need resync */
  ------------------
  |  Branch (4766:16): [True: 60.4k, False: 30.9k]
  ------------------
 4767|  60.4k|      current_frame->refresh_frame_flags =
 4768|  60.4k|          frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES);
  ------------------
  |  Branch (4768:11): [True: 78, False: 60.3k]
  ------------------
 4769|  60.4k|    }
 4770|   127k|  }
 4771|       |
 4772|   240k|  if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) {
  ------------------
  |  Branch (4772:7): [True: 92.1k, False: 148k]
  |  Branch (4772:35): [True: 82.4k, False: 65.6k]
  ------------------
 4773|       |    // Read all ref frame order hints if error_resilient_mode == 1
 4774|   165k|    if (features->error_resilient_mode &&
  ------------------
  |  Branch (4774:9): [True: 43.7k, False: 121k]
  ------------------
 4775|   165k|        seq_params->order_hint_info.enable_order_hint) {
  ------------------
  |  Branch (4775:9): [True: 40.8k, False: 2.91k]
  ------------------
 4776|   364k|      for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
  ------------------
  |  Branch (4776:29): [True: 323k, False: 40.8k]
  ------------------
 4777|       |        // Read order hint from bit stream
 4778|   323k|        unsigned int order_hint = aom_rb_read_literal(
 4779|   323k|            rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
 4780|       |        // Get buffer
 4781|   323k|        RefCntBuffer *buf = cm->ref_frame_map[ref_idx];
 4782|   323k|        if (buf == NULL || order_hint != buf->order_hint) {
  ------------------
  |  Branch (4782:13): [True: 203k, False: 119k]
  |  Branch (4782:28): [True: 73.4k, False: 46.4k]
  ------------------
 4783|   277k|          if (buf != NULL) {
  ------------------
  |  Branch (4783:15): [True: 73.4k, False: 203k]
  ------------------
 4784|  73.4k|            lock_buffer_pool(pool);
 4785|  73.4k|            decrease_ref_count(buf, pool);
 4786|  73.4k|            unlock_buffer_pool(pool);
 4787|  73.4k|            cm->ref_frame_map[ref_idx] = NULL;
 4788|  73.4k|          }
 4789|       |          // If no corresponding buffer exists, allocate a new buffer with all
 4790|       |          // pixels set to neutral grey.
 4791|   277k|          int buf_idx = get_free_fb(cm);
 4792|   277k|          if (buf_idx == INVALID_IDX) {
  ------------------
  |  |   15|   277k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  |  Branch (4792:15): [True: 0, False: 277k]
  ------------------
 4793|      0|            aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
 4794|      0|                               "Unable to find free frame buffer");
 4795|      0|          }
 4796|   277k|          buf = &frame_bufs[buf_idx];
 4797|   277k|          lock_buffer_pool(pool);
 4798|   277k|#if CONFIG_SIZE_LIMIT
 4799|   277k|          if (seq_params->max_frame_width > DECODE_WIDTH_LIMIT ||
  ------------------
  |  |   78|   554k|#define DECODE_WIDTH_LIMIT 12288
  ------------------
  |  Branch (4799:15): [True: 75, False: 276k]
  ------------------
 4800|   277k|              seq_params->max_frame_height > DECODE_HEIGHT_LIMIT) {
  ------------------
  |  |   77|   276k|#define DECODE_HEIGHT_LIMIT 12288
  ------------------
  |  Branch (4800:15): [True: 70, False: 276k]
  ------------------
 4801|    145|            decrease_ref_count(buf, pool);
 4802|    145|            unlock_buffer_pool(pool);
 4803|    145|            aom_internal_error(
 4804|    145|                cm->error, AOM_CODEC_CORRUPT_FRAME,
 4805|    145|                "Dimensions of %dx%d beyond allowed size of %dx%d.",
 4806|    145|                seq_params->max_frame_width, seq_params->max_frame_height,
 4807|    145|                DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
  ------------------
  |  |   78|    145|#define DECODE_WIDTH_LIMIT 12288
  ------------------
                              DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
  ------------------
  |  |   77|    145|#define DECODE_HEIGHT_LIMIT 12288
  ------------------
 4808|    145|          }
 4809|   277k|#endif
 4810|   277k|          if (aom_realloc_frame_buffer(
  ------------------
  |  Branch (4810:15): [True: 322, False: 276k]
  ------------------
 4811|   277k|                  &buf->buf, seq_params->max_frame_width,
 4812|   277k|                  seq_params->max_frame_height, seq_params->subsampling_x,
 4813|   277k|                  seq_params->subsampling_y, seq_params->use_highbitdepth,
 4814|   277k|                  AOM_BORDER_IN_PIXELS, features->byte_alignment,
  ------------------
  |  |   32|   277k|#define AOM_BORDER_IN_PIXELS 288
  ------------------
 4815|   277k|                  &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, false,
 4816|   277k|                  0)) {
 4817|    322|            decrease_ref_count(buf, pool);
 4818|    322|            unlock_buffer_pool(pool);
 4819|    322|            aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
 4820|    322|                               "Failed to allocate frame buffer");
 4821|    322|          }
 4822|   277k|          unlock_buffer_pool(pool);
 4823|       |          // According to the specification, valid bitstreams are required to
 4824|       |          // never use missing reference frames so the filling process for
 4825|       |          // missing frames is not normatively defined and RefValid for missing
 4826|       |          // frames is set to 0.
 4827|       |
 4828|       |          // To make libaom more robust when the bitstream has been corrupted
 4829|       |          // by the loss of some frames of data, this code adds a neutral grey
 4830|       |          // buffer in place of missing frames, i.e.
 4831|       |          //
 4832|   277k|          set_planes_to_neutral_grey(seq_params, &buf->buf, 0);
 4833|       |          //
 4834|       |          // and allows the frames to be used for referencing, i.e.
 4835|       |          //
 4836|   277k|          pbi->valid_for_referencing[ref_idx] = 1;
 4837|       |          //
 4838|       |          // Please note such behavior is not normative and other decoders may
 4839|       |          // use a different approach.
 4840|   277k|          cm->ref_frame_map[ref_idx] = buf;
 4841|   277k|          buf->order_hint = order_hint;
 4842|   277k|        }
 4843|   323k|      }
 4844|  40.8k|    }
 4845|   165k|  }
 4846|       |
 4847|   240k|  if (current_frame->frame_type == KEY_FRAME) {
  ------------------
  |  Branch (4847:7): [True: 113k, False: 127k]
  ------------------
 4848|   113k|    setup_frame_size(cm, frame_size_override_flag, rb);
 4849|       |
 4850|   113k|    if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
  ------------------
  |  Branch (4850:9): [True: 66.7k, False: 46.2k]
  |  Branch (4850:49): [True: 56.8k, False: 9.87k]
  ------------------
 4851|  56.8k|      features->allow_intrabc = aom_rb_read_bit(rb);
 4852|   113k|    features->allow_ref_frame_mvs = 0;
 4853|   113k|    cm->prev_frame = NULL;
 4854|   127k|  } else {
 4855|   127k|    features->allow_ref_frame_mvs = 0;
 4856|       |
 4857|   127k|    if (current_frame->frame_type == INTRA_ONLY_FRAME) {
  ------------------
  |  Branch (4857:9): [True: 34.5k, False: 92.6k]
  ------------------
 4858|  34.5k|      cm->cur_frame->film_grain_params_present =
 4859|  34.5k|          seq_params->film_grain_params_present;
 4860|  34.5k|      setup_frame_size(cm, frame_size_override_flag, rb);
 4861|  34.5k|      if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
  ------------------
  |  Branch (4861:11): [True: 28.4k, False: 6.15k]
  |  Branch (4861:51): [True: 27.4k, False: 976]
  ------------------
 4862|  27.4k|        features->allow_intrabc = aom_rb_read_bit(rb);
 4863|       |
 4864|  92.6k|    } else if (pbi->need_resync != 1) { /* Skip if need resync */
  ------------------
  |  Branch (4864:16): [True: 60.4k, False: 32.2k]
  ------------------
 4865|  60.4k|      int frame_refs_short_signaling = 0;
 4866|       |      // Frame refs short signaling is off when error resilient mode is on.
 4867|  60.4k|      if (seq_params->order_hint_info.enable_order_hint)
  ------------------
  |  Branch (4867:11): [True: 59.9k, False: 421]
  ------------------
 4868|  59.9k|        frame_refs_short_signaling = aom_rb_read_bit(rb);
 4869|       |
 4870|  60.4k|      if (frame_refs_short_signaling) {
  ------------------
  |  Branch (4870:11): [True: 25.6k, False: 34.7k]
  ------------------
 4871|       |        // == LAST_FRAME ==
 4872|  25.6k|        const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
  ------------------
  |  |  554|  25.6k|#define REF_FRAMES_LOG2 3
  ------------------
 4873|  25.6k|        const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref];
 4874|       |
 4875|       |        // == GOLDEN_FRAME ==
 4876|  25.6k|        const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
  ------------------
  |  |  554|  25.6k|#define REF_FRAMES_LOG2 3
  ------------------
 4877|  25.6k|        const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref];
 4878|       |
 4879|       |        // Most of the time, streams start with a keyframe. In that case,
 4880|       |        // ref_frame_map will have been filled in at that point and will not
 4881|       |        // contain any NULLs. However, streams are explicitly allowed to start
 4882|       |        // with an intra-only frame, so long as they don't then signal a
 4883|       |        // reference to a slot that hasn't been set yet. That's what we are
 4884|       |        // checking here.
 4885|  25.6k|        if (lst_buf == NULL)
  ------------------
  |  Branch (4885:13): [True: 86, False: 25.5k]
  ------------------
 4886|     86|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4887|     86|                             "Inter frame requests nonexistent reference");
 4888|  25.6k|        if (gld_buf == NULL)
  ------------------
  |  Branch (4888:13): [True: 23, False: 25.6k]
  ------------------
 4889|     23|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4890|     23|                             "Inter frame requests nonexistent reference");
 4891|       |
 4892|  25.6k|        av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref);
 4893|  25.6k|      }
 4894|       |
 4895|   475k|      for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (4895:23): [True: 415k, False: 60.4k]
  ------------------
 4896|   415k|        int ref = 0;
 4897|   415k|        if (!frame_refs_short_signaling) {
  ------------------
  |  Branch (4897:13): [True: 240k, False: 175k]
  ------------------
 4898|   240k|          ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
  ------------------
  |  |  554|   240k|#define REF_FRAMES_LOG2 3
  ------------------
 4899|       |
 4900|       |          // Most of the time, streams start with a keyframe. In that case,
 4901|       |          // ref_frame_map will have been filled in at that point and will not
 4902|       |          // contain any NULLs. However, streams are explicitly allowed to start
 4903|       |          // with an intra-only frame, so long as they don't then signal a
 4904|       |          // reference to a slot that hasn't been set yet. That's what we are
 4905|       |          // checking here.
 4906|   240k|          if (cm->ref_frame_map[ref] == NULL)
  ------------------
  |  Branch (4906:15): [True: 460, False: 239k]
  ------------------
 4907|    460|            aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4908|    460|                               "Inter frame requests nonexistent reference");
 4909|   240k|          cm->remapped_ref_idx[i] = ref;
 4910|   240k|        } else {
 4911|   175k|          ref = cm->remapped_ref_idx[i];
 4912|   175k|        }
 4913|       |        // Check valid for referencing
 4914|   415k|        if (pbi->valid_for_referencing[ref] == 0)
  ------------------
  |  Branch (4914:13): [True: 62, False: 415k]
  ------------------
 4915|     62|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4916|     62|                             "Reference frame not valid for referencing");
 4917|       |
 4918|   415k|        cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
 4919|       |
 4920|   415k|        if (seq_params->frame_id_numbers_present_flag) {
  ------------------
  |  Branch (4920:13): [True: 119, False: 415k]
  ------------------
 4921|    119|          int frame_id_length = seq_params->frame_id_length;
 4922|    119|          int diff_len = seq_params->delta_frame_id_length;
 4923|    119|          int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len);
 4924|    119|          int ref_frame_id =
 4925|    119|              ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) +
 4926|    119|                (1 << frame_id_length)) %
 4927|    119|               (1 << frame_id_length));
 4928|       |          // Compare values derived from delta_frame_id_minus_1 and
 4929|       |          // refresh_frame_flags.
 4930|    119|          if (ref_frame_id != cm->ref_frame_id[ref])
  ------------------
  |  Branch (4930:15): [True: 105, False: 14]
  ------------------
 4931|    105|            aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4932|    105|                               "Reference buffer frame ID mismatch");
 4933|    119|        }
 4934|   415k|      }
 4935|       |
 4936|  60.4k|      if (!features->error_resilient_mode && frame_size_override_flag) {
  ------------------
  |  Branch (4936:11): [True: 51.9k, False: 8.48k]
  |  Branch (4936:46): [True: 38.8k, False: 13.1k]
  ------------------
 4937|  38.8k|        setup_frame_size_with_refs(cm, rb);
 4938|  38.8k|      } else {
 4939|  21.5k|        setup_frame_size(cm, frame_size_override_flag, rb);
 4940|  21.5k|      }
 4941|       |
 4942|  60.4k|      if (features->cur_frame_force_integer_mv) {
  ------------------
  |  Branch (4942:11): [True: 7.42k, False: 52.9k]
  ------------------
 4943|  7.42k|        features->allow_high_precision_mv = 0;
 4944|  52.9k|      } else {
 4945|  52.9k|        features->allow_high_precision_mv = aom_rb_read_bit(rb);
 4946|  52.9k|      }
 4947|  60.4k|      features->interp_filter = read_frame_interp_filter(rb);
 4948|  60.4k|      features->switchable_motion_mode = aom_rb_read_bit(rb);
 4949|  60.4k|    }
 4950|       |
 4951|   127k|    cm->prev_frame = get_primary_ref_frame_buf(cm);
 4952|   127k|    if (features->primary_ref_frame != PRIMARY_REF_NONE &&
  ------------------
  |  |   66|   254k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (4952:9): [True: 64.5k, False: 62.6k]
  ------------------
 4953|   127k|        get_primary_ref_frame_buf(cm) == NULL) {
  ------------------
  |  Branch (4953:9): [True: 10.1k, False: 54.3k]
  ------------------
 4954|  10.1k|      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 4955|  10.1k|                         "Reference frame containing this frame's initial "
 4956|  10.1k|                         "frame context is unavailable.");
 4957|  10.1k|    }
 4958|       |
 4959|   127k|    if (!(current_frame->frame_type == INTRA_ONLY_FRAME) &&
  ------------------
  |  Branch (4959:9): [True: 65.4k, False: 61.7k]
  ------------------
 4960|   127k|        pbi->need_resync != 1) {
  ------------------
  |  Branch (4960:9): [True: 53.0k, False: 12.3k]
  ------------------
 4961|  53.0k|      if (frame_might_allow_ref_frame_mvs(cm))
  ------------------
  |  Branch (4961:11): [True: 44.8k, False: 8.19k]
  ------------------
 4962|  44.8k|        features->allow_ref_frame_mvs = aom_rb_read_bit(rb);
 4963|  8.19k|      else
 4964|  8.19k|        features->allow_ref_frame_mvs = 0;
 4965|       |
 4966|   396k|      for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (4966:32): [True: 342k, False: 53.0k]
  ------------------
 4967|   342k|        const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
 4968|   342k|        struct scale_factors *const ref_scale_factors =
 4969|   342k|            get_ref_scale_factors(cm, i);
 4970|   342k|        av1_setup_scale_factors_for_frame(
 4971|   342k|            ref_scale_factors, ref_buf->buf.y_crop_width,
 4972|   342k|            ref_buf->buf.y_crop_height, cm->width, cm->height);
 4973|   342k|        if ((!av1_is_valid_scale(ref_scale_factors)))
  ------------------
  |  Branch (4973:13): [True: 6.81k, False: 336k]
  ------------------
 4974|  6.81k|          aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
 4975|  6.81k|                             "Reference frame has invalid dimensions");
 4976|   342k|      }
 4977|  53.0k|    }
 4978|   127k|  }
 4979|       |
 4980|   240k|  av1_setup_frame_buf_refs(cm);
 4981|       |
 4982|   240k|  av1_setup_frame_sign_bias(cm);
 4983|       |
 4984|   240k|  cm->cur_frame->frame_type = current_frame->frame_type;
 4985|       |
 4986|   240k|  update_ref_frame_id(pbi);
 4987|       |
 4988|   240k|  const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) &&
  ------------------
  |  Branch (4988:31): [True: 165k, False: 74.3k]
  ------------------
 4989|   240k|                              !(features->disable_cdf_update);
  ------------------
  |  Branch (4989:31): [True: 136k, False: 29.1k]
  ------------------
 4990|   240k|  if (might_bwd_adapt) {
  ------------------
  |  Branch (4990:7): [True: 136k, False: 103k]
  ------------------
 4991|   136k|    features->refresh_frame_context = aom_rb_read_bit(rb)
  ------------------
  |  Branch (4991:39): [True: 59.1k, False: 77.6k]
  ------------------
 4992|   136k|                                          ? REFRESH_FRAME_CONTEXT_DISABLED
 4993|   136k|                                          : REFRESH_FRAME_CONTEXT_BACKWARD;
 4994|   136k|  } else {
 4995|   103k|    features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 4996|   103k|  }
 4997|       |
 4998|   240k|  cm->cur_frame->buf.bit_depth = seq_params->bit_depth;
 4999|   240k|  cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
 5000|   240k|  cm->cur_frame->buf.transfer_characteristics =
 5001|   240k|      seq_params->transfer_characteristics;
 5002|   240k|  cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
 5003|   240k|  cm->cur_frame->buf.monochrome = seq_params->monochrome;
 5004|   240k|  cm->cur_frame->buf.chroma_sample_position =
 5005|   240k|      seq_params->chroma_sample_position;
 5006|   240k|  cm->cur_frame->buf.color_range = seq_params->color_range;
 5007|   240k|  cm->cur_frame->buf.render_width = cm->render_width;
 5008|   240k|  cm->cur_frame->buf.render_height = cm->render_height;
 5009|       |
 5010|   240k|  if (pbi->need_resync) {
  ------------------
  |  Branch (5010:7): [True: 12.3k, False: 227k]
  ------------------
 5011|  12.3k|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5012|  12.3k|                       "Keyframe / intra-only frame required to reset decoder"
 5013|  12.3k|                       " state");
 5014|  12.3k|  }
 5015|       |
 5016|   240k|  if (features->allow_intrabc) {
  ------------------
  |  Branch (5016:7): [True: 44.6k, False: 195k]
  ------------------
 5017|       |    // Set parameters corresponding to no filtering.
 5018|  44.6k|    struct loopfilter *lf = &cm->lf;
 5019|  44.6k|    lf->filter_level[0] = 0;
 5020|  44.6k|    lf->filter_level[1] = 0;
 5021|  44.6k|    cm->cdef_info.cdef_bits = 0;
 5022|  44.6k|    cm->cdef_info.cdef_strengths[0] = 0;
 5023|  44.6k|    cm->cdef_info.nb_cdef_strengths = 1;
 5024|  44.6k|    cm->cdef_info.cdef_uv_strengths[0] = 0;
 5025|  44.6k|    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
 5026|  44.6k|    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
 5027|  44.6k|    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
 5028|  44.6k|  }
 5029|       |
 5030|   240k|  read_tile_info(pbi, rb);
 5031|   240k|  if (!av1_is_min_tile_width_satisfied(cm)) {
  ------------------
  |  Branch (5031:7): [True: 107, False: 240k]
  ------------------
 5032|    107|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5033|    107|                       "Minimum tile width requirement not satisfied");
 5034|    107|  }
 5035|       |
 5036|   240k|  CommonQuantParams *const quant_params = &cm->quant_params;
 5037|   240k|  setup_quantization(quant_params, av1_num_planes(cm),
 5038|   240k|                     cm->seq_params->separate_uv_delta_q, rb);
 5039|   240k|  xd->bd = (int)seq_params->bit_depth;
 5040|       |
 5041|   240k|  CommonContexts *const above_contexts = &cm->above_contexts;
 5042|   240k|  if (above_contexts->num_planes < av1_num_planes(cm) ||
  ------------------
  |  Branch (5042:7): [True: 66.1k, False: 174k]
  ------------------
 5043|   240k|      above_contexts->num_mi_cols < cm->mi_params.mi_cols ||
  ------------------
  |  Branch (5043:7): [True: 3.34k, False: 170k]
  ------------------
 5044|   240k|      above_contexts->num_tile_rows < cm->tiles.rows) {
  ------------------
  |  Branch (5044:7): [True: 1.28k, False: 169k]
  ------------------
 5045|  19.2k|    av1_free_above_context_buffers(above_contexts);
 5046|  19.2k|    if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
  ------------------
  |  Branch (5046:9): [True: 0, False: 19.2k]
  ------------------
 5047|  19.2k|                                        cm->mi_params.mi_cols,
 5048|  19.2k|                                        av1_num_planes(cm))) {
 5049|      0|      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
 5050|      0|                         "Failed to allocate context buffers");
 5051|      0|    }
 5052|  19.2k|  }
 5053|       |
 5054|   240k|  if (features->primary_ref_frame == PRIMARY_REF_NONE) {
  ------------------
  |  |   66|   240k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (5054:7): [True: 150k, False: 90.0k]
  ------------------
 5055|   150k|    av1_setup_past_independence(cm);
 5056|   150k|  }
 5057|       |
 5058|   240k|  setup_segmentation(cm, rb);
 5059|       |
 5060|   240k|  cm->delta_q_info.delta_q_res = 1;
 5061|   240k|  cm->delta_q_info.delta_lf_res = 1;
 5062|   240k|  cm->delta_q_info.delta_lf_present_flag = 0;
 5063|   240k|  cm->delta_q_info.delta_lf_multi = 0;
 5064|   240k|  cm->delta_q_info.delta_q_present_flag =
 5065|   240k|      quant_params->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (5065:7): [True: 146k, False: 93.4k]
  ------------------
 5066|   240k|  if (cm->delta_q_info.delta_q_present_flag) {
  ------------------
  |  Branch (5066:7): [True: 20.3k, False: 219k]
  ------------------
 5067|  20.3k|    xd->current_base_qindex = quant_params->base_qindex;
 5068|  20.3k|    cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2);
 5069|  20.3k|    if (!features->allow_intrabc)
  ------------------
  |  Branch (5069:9): [True: 16.5k, False: 3.82k]
  ------------------
 5070|  16.5k|      cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb);
 5071|  20.3k|    if (cm->delta_q_info.delta_lf_present_flag) {
  ------------------
  |  Branch (5071:9): [True: 6.98k, False: 13.4k]
  ------------------
 5072|  6.98k|      cm->delta_q_info.delta_lf_res = 1 << aom_rb_read_literal(rb, 2);
 5073|  6.98k|      cm->delta_q_info.delta_lf_multi = aom_rb_read_bit(rb);
 5074|  6.98k|      av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
 5075|  6.98k|    }
 5076|  20.3k|  }
 5077|       |
 5078|   240k|  xd->cur_frame_force_integer_mv = features->cur_frame_force_integer_mv;
 5079|       |
 5080|  1.73M|  for (int i = 0; i < MAX_SEGMENTS; ++i) {
  ------------------
  |  |   21|  1.73M|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (5080:19): [True: 1.49M, False: 240k]
  ------------------
 5081|  1.49M|    const int qindex = av1_get_qindex(&cm->seg, i, quant_params->base_qindex);
 5082|  1.49M|    xd->lossless[i] =
 5083|  1.49M|        qindex == 0 && quant_params->y_dc_delta_q == 0 &&
  ------------------
  |  Branch (5083:9): [True: 327k, False: 1.16M]
  |  Branch (5083:24): [True: 308k, False: 18.6k]
  ------------------
 5084|  1.49M|        quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 &&
  ------------------
  |  Branch (5084:9): [True: 297k, False: 10.5k]
  |  Branch (5084:44): [True: 293k, False: 4.65k]
  ------------------
 5085|  1.49M|        quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0;
  ------------------
  |  Branch (5085:9): [True: 292k, False: 411]
  |  Branch (5085:44): [True: 292k, False: 105]
  ------------------
 5086|  1.49M|    xd->qindex[i] = qindex;
 5087|  1.49M|  }
 5088|   240k|  features->coded_lossless = is_coded_lossless(cm, xd);
 5089|   240k|  features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm);
  ------------------
  |  Branch (5089:28): [True: 35.8k, False: 204k]
  |  Branch (5089:56): [True: 35.3k, False: 521]
  ------------------
 5090|   240k|  setup_segmentation_dequant(cm, xd);
 5091|   240k|  if (features->coded_lossless) {
  ------------------
  |  Branch (5091:7): [True: 35.8k, False: 204k]
  ------------------
 5092|  35.8k|    cm->lf.filter_level[0] = 0;
 5093|  35.8k|    cm->lf.filter_level[1] = 0;
 5094|  35.8k|  }
 5095|   240k|  if (features->coded_lossless || !seq_params->enable_cdef) {
  ------------------
  |  Branch (5095:7): [True: 89.7k, False: 150k]
  |  Branch (5095:35): [True: 78.2k, False: 72.2k]
  ------------------
 5096|   114k|    cm->cdef_info.cdef_bits = 0;
 5097|   114k|    cm->cdef_info.cdef_strengths[0] = 0;
 5098|   114k|    cm->cdef_info.cdef_uv_strengths[0] = 0;
 5099|   114k|  }
 5100|   240k|  if (features->all_lossless || !seq_params->enable_restoration) {
  ------------------
  |  Branch (5100:7): [True: 89.2k, False: 151k]
  |  Branch (5100:33): [True: 65.6k, False: 85.3k]
  ------------------
 5101|   101k|    cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
 5102|   101k|    cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
 5103|   101k|    cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
 5104|   101k|  }
 5105|   240k|  setup_loopfilter(cm, rb);
 5106|       |
 5107|   240k|  if (!features->coded_lossless && seq_params->enable_cdef) {
  ------------------
  |  Branch (5107:7): [True: 148k, False: 91.7k]
  |  Branch (5107:36): [True: 71.3k, False: 77.1k]
  ------------------
 5108|  71.3k|    setup_cdef(cm, rb);
 5109|  71.3k|  }
 5110|   240k|  if (!features->all_lossless && seq_params->enable_restoration) {
  ------------------
  |  Branch (5110:7): [True: 146k, False: 93.4k]
  |  Branch (5110:34): [True: 81.8k, False: 64.9k]
  ------------------
 5111|  81.8k|    decode_restoration_mode(cm, rb);
 5112|  81.8k|  }
 5113|       |
 5114|   240k|  features->tx_mode = read_tx_mode(rb, features->coded_lossless);
 5115|   240k|  current_frame->reference_mode = read_frame_reference_mode(cm, rb);
 5116|       |
 5117|   240k|  av1_setup_skip_mode_allowed(cm);
 5118|   240k|  current_frame->skip_mode_info.skip_mode_flag =
 5119|   240k|      current_frame->skip_mode_info.skip_mode_allowed ? aom_rb_read_bit(rb) : 0;
  ------------------
  |  Branch (5119:7): [True: 23.4k, False: 216k]
  ------------------
 5120|       |
 5121|   240k|  if (frame_might_allow_warped_motion(cm))
  ------------------
  |  Branch (5121:7): [True: 32.4k, False: 207k]
  ------------------
 5122|  32.4k|    features->allow_warped_motion = aom_rb_read_bit(rb);
 5123|   207k|  else
 5124|   207k|    features->allow_warped_motion = 0;
 5125|       |
 5126|   240k|  features->reduced_tx_set_used = aom_rb_read_bit(rb);
 5127|       |
 5128|   240k|  if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
  ------------------
  |  Branch (5128:7): [True: 16.8k, False: 223k]
  |  Branch (5128:40): [True: 0, False: 16.8k]
  ------------------
 5129|      0|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 5130|      0|                       "Frame wrongly requests reference frame MVs");
 5131|      0|  }
 5132|       |
 5133|   240k|  if (!frame_is_intra_only(cm)) read_global_motion(cm, rb);
  ------------------
  |  Branch (5133:7): [True: 46.0k, False: 194k]
  ------------------
 5134|       |
 5135|   240k|  cm->cur_frame->film_grain_params_present =
 5136|   240k|      seq_params->film_grain_params_present;
 5137|   240k|  read_film_grain(cm, rb);
 5138|       |
 5139|   240k|#if EXT_TILE_DEBUG
 5140|   240k|  if (pbi->ext_tile_debug && cm->tiles.large_scale) {
  ------------------
  |  Branch (5140:7): [True: 46.3k, False: 193k]
  |  Branch (5140:30): [True: 21.3k, False: 24.9k]
  ------------------
 5141|  21.3k|    read_ext_tile_info(pbi, rb);
 5142|  21.3k|    av1_set_single_tile_decoding_mode(cm);
 5143|  21.3k|  }
 5144|   240k|#endif  // EXT_TILE_DEBUG
 5145|   240k|  return 0;
 5146|   251k|}
decodeframe.c:reset_frame_buffers:
 4464|  27.5k|static inline void reset_frame_buffers(AV1_COMMON *cm) {
 4465|  27.5k|  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 4466|  27.5k|  int i;
 4467|       |
 4468|  27.5k|  lock_buffer_pool(cm->buffer_pool);
 4469|  27.5k|  reset_ref_frame_map(cm);
 4470|  27.5k|  assert(cm->cur_frame->ref_count == 1);
 4471|   468k|  for (i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) {
  ------------------
  |  Branch (4471:15): [True: 441k, False: 27.5k]
  ------------------
 4472|       |    // Reset all unreferenced frame buffers. We can also reset cm->cur_frame
 4473|       |    // because we are the sole owner of cm->cur_frame.
 4474|   441k|    if (frame_bufs[i].ref_count > 0 && &frame_bufs[i] != cm->cur_frame) {
  ------------------
  |  Branch (4474:9): [True: 32.6k, False: 408k]
  |  Branch (4474:40): [True: 5.04k, False: 27.5k]
  ------------------
 4475|  5.04k|      continue;
 4476|  5.04k|    }
 4477|   436k|    frame_bufs[i].order_hint = 0;
 4478|   436k|    av1_zero(frame_bufs[i].ref_order_hints);
  ------------------
  |  |   43|   436k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 4479|   436k|  }
 4480|  27.5k|  av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers);
 4481|  27.5k|  unlock_buffer_pool(cm->buffer_pool);
 4482|  27.5k|}
decodeframe.c:read_temporal_point_info:
 4211|  2.18k|                                            struct aom_read_bit_buffer *rb) {
 4212|  2.18k|  cm->frame_presentation_time = aom_rb_read_unsigned_literal(
 4213|  2.18k|      rb, cm->seq_params->decoder_model_info.frame_presentation_time_length);
 4214|  2.18k|}
decodeframe.c:show_existing_frame_reset:
 4438|    176|                                             int existing_frame_idx) {
 4439|    176|  AV1_COMMON *const cm = &pbi->common;
 4440|       |
 4441|    176|  assert(cm->show_existing_frame);
 4442|       |
 4443|    176|  cm->current_frame.frame_type = KEY_FRAME;
 4444|       |
 4445|    176|  cm->current_frame.refresh_frame_flags = (1 << REF_FRAMES) - 1;
 4446|       |
 4447|  1.40k|  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (4447:19): [True: 1.23k, False: 176]
  ------------------
 4448|  1.23k|    cm->remapped_ref_idx[i] = INVALID_IDX;
  ------------------
  |  |   15|  1.23k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
 4449|  1.23k|  }
 4450|       |
 4451|    176|  if (pbi->need_resync) {
  ------------------
  |  Branch (4451:7): [True: 105, False: 71]
  ------------------
 4452|    105|    reset_ref_frame_map(cm);
 4453|    105|    pbi->need_resync = 0;
 4454|    105|  }
 4455|       |
 4456|       |  // Note that the displayed frame must be valid for referencing in order to
 4457|       |  // have been selected.
 4458|    176|  cm->current_frame_id = cm->ref_frame_id[existing_frame_idx];
 4459|    176|  update_ref_frame_id(pbi);
 4460|       |
 4461|    176|  cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 4462|    176|}
decodeframe.c:reset_ref_frame_map:
 4415|   140k|static inline void reset_ref_frame_map(AV1_COMMON *const cm) {
 4416|   140k|  BufferPool *const pool = cm->buffer_pool;
 4417|       |
 4418|  1.26M|  for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (4418:19): [True: 1.12M, False: 140k]
  ------------------
 4419|  1.12M|    decrease_ref_count(cm->ref_frame_map[i], pool);
 4420|  1.12M|    cm->ref_frame_map[i] = NULL;
 4421|  1.12M|  }
 4422|   140k|}
decodeframe.c:setup_frame_size:
 1980|   168k|                                    struct aom_read_bit_buffer *rb) {
 1981|   168k|  const SequenceHeader *const seq_params = cm->seq_params;
 1982|   168k|  int width, height;
 1983|       |
 1984|   168k|  if (frame_size_override_flag) {
  ------------------
  |  Branch (1984:7): [True: 68.8k, False: 99.1k]
  ------------------
 1985|  68.8k|    int num_bits_width = seq_params->num_bits_width;
 1986|  68.8k|    int num_bits_height = seq_params->num_bits_height;
 1987|  68.8k|    read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
 1988|  68.8k|    if (width > seq_params->max_frame_width ||
  ------------------
  |  Branch (1988:9): [True: 1.92k, False: 66.9k]
  ------------------
 1989|  68.8k|        height > seq_params->max_frame_height) {
  ------------------
  |  Branch (1989:9): [True: 1.01k, False: 65.9k]
  ------------------
 1990|  2.94k|      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 1991|  2.94k|                         "Frame dimensions are larger than the maximum values");
 1992|  2.94k|    }
 1993|  99.1k|  } else {
 1994|  99.1k|    width = seq_params->max_frame_width;
 1995|  99.1k|    height = seq_params->max_frame_height;
 1996|  99.1k|  }
 1997|       |
 1998|   168k|  setup_superres(cm, rb, &width, &height);
 1999|   168k|  resize_context_buffers(cm, width, height);
 2000|   168k|  setup_render_size(cm, rb);
 2001|   168k|  setup_buffer_pool(cm);
 2002|   168k|}
decodeframe.c:read_frame_size:
 1873|   111k|                            int num_bits_height, int *width, int *height) {
 1874|   111k|  *width = aom_rb_read_literal(rb, num_bits_width) + 1;
 1875|   111k|  *height = aom_rb_read_literal(rb, num_bits_height) + 1;
 1876|   111k|}
decodeframe.c:setup_superres:
 1889|   203k|                                  int *height) {
 1890|   203k|  cm->superres_upscaled_width = *width;
 1891|   203k|  cm->superres_upscaled_height = *height;
 1892|       |
 1893|   203k|  const SequenceHeader *const seq_params = cm->seq_params;
 1894|   203k|  if (!seq_params->enable_superres) return;
  ------------------
  |  Branch (1894:7): [True: 122k, False: 81.7k]
  ------------------
 1895|       |
 1896|  81.7k|  if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (1896:7): [True: 37.8k, False: 43.9k]
  ------------------
 1897|  37.8k|    cm->superres_scale_denominator =
 1898|  37.8k|        (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS);
  ------------------
  |  |  638|  37.8k|#define SUPERRES_SCALE_BITS 3
  ------------------
 1899|  37.8k|    cm->superres_scale_denominator += SUPERRES_SCALE_DENOMINATOR_MIN;
  ------------------
  |  |  639|  37.8k|#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
  |  |  ------------------
  |  |  |  |   22|  37.8k|#define SCALE_NUMERATOR 8
  |  |  ------------------
  ------------------
 1900|       |    // Don't edit cm->width or cm->height directly, or the buffers won't get
 1901|       |    // resized correctly
 1902|  37.8k|    av1_calculate_scaled_superres_size(width, height,
 1903|  37.8k|                                       cm->superres_scale_denominator);
 1904|  43.9k|  } else {
 1905|       |    // 1:1 scaling - ie. no scaling, scale not provided
 1906|  43.9k|    cm->superres_scale_denominator = SCALE_NUMERATOR;
  ------------------
  |  |   22|  43.9k|#define SCALE_NUMERATOR 8
  ------------------
 1907|  43.9k|  }
 1908|  81.7k|}
decodeframe.c:resize_context_buffers:
 1911|   203k|                                          int height) {
 1912|   203k|#if CONFIG_SIZE_LIMIT
 1913|   203k|  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
  ------------------
  |  |   78|   407k|#define DECODE_WIDTH_LIMIT 12288
  ------------------
                if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
  ------------------
  |  |   77|   203k|#define DECODE_HEIGHT_LIMIT 12288
  ------------------
  |  Branch (1913:7): [True: 58, False: 203k]
  |  Branch (1913:37): [True: 73, False: 203k]
  ------------------
 1914|    131|    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 1915|    131|                       "Dimensions of %dx%d beyond allowed size of %dx%d.",
 1916|    131|                       width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
  ------------------
  |  |   78|    131|#define DECODE_WIDTH_LIMIT 12288
  ------------------
                                     width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
  ------------------
  |  |   77|    131|#define DECODE_HEIGHT_LIMIT 12288
  ------------------
 1917|   203k|#endif
 1918|   203k|  if (cm->width != width || cm->height != height) {
  ------------------
  |  Branch (1918:7): [True: 83.0k, False: 120k]
  |  Branch (1918:29): [True: 12.7k, False: 108k]
  ------------------
 1919|  95.7k|    const int new_mi_rows = CEIL_POWER_OF_TWO(height, MI_SIZE_LOG2);
  ------------------
  |  |   62|  95.7k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
 1920|  95.7k|    const int new_mi_cols = CEIL_POWER_OF_TWO(width, MI_SIZE_LOG2);
  ------------------
  |  |   62|  95.7k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
 1921|       |
 1922|       |    // Allocations in av1_alloc_context_buffers() depend on individual
 1923|       |    // dimensions as well as the overall size.
 1924|  95.7k|    if (new_mi_cols > cm->mi_params.mi_cols ||
  ------------------
  |  Branch (1924:9): [True: 39.3k, False: 56.3k]
  ------------------
 1925|  95.7k|        new_mi_rows > cm->mi_params.mi_rows) {
  ------------------
  |  Branch (1925:9): [True: 16.8k, False: 39.5k]
  ------------------
 1926|  56.1k|      if (av1_alloc_context_buffers(cm, width, height, BLOCK_4X4)) {
  ------------------
  |  Branch (1926:11): [True: 262, False: 55.8k]
  ------------------
 1927|       |        // The cm->mi_* values have been cleared and any existing context
 1928|       |        // buffers have been freed. Clear cm->width and cm->height to be
 1929|       |        // consistent and to force a realloc next time.
 1930|    262|        cm->width = 0;
 1931|    262|        cm->height = 0;
 1932|    262|        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1933|    262|                           "Failed to allocate context buffers");
 1934|    262|      }
 1935|  56.1k|    } else {
 1936|  39.5k|      cm->mi_params.set_mb_mi(&cm->mi_params, width, height, BLOCK_4X4);
 1937|  39.5k|    }
 1938|  95.7k|    av1_init_mi_buffers(&cm->mi_params);
 1939|  95.7k|    cm->width = width;
 1940|  95.7k|    cm->height = height;
 1941|  95.7k|  }
 1942|       |
 1943|   203k|  ensure_mv_buffer(cm->cur_frame, cm);
 1944|   203k|  cm->cur_frame->width = cm->width;
 1945|   203k|  cm->cur_frame->height = cm->height;
 1946|   203k|}
decodeframe.c:setup_render_size:
 1879|   171k|                                     struct aom_read_bit_buffer *rb) {
 1880|   171k|  cm->render_width = cm->superres_upscaled_width;
 1881|   171k|  cm->render_height = cm->superres_upscaled_height;
 1882|   171k|  if (aom_rb_read_bit(rb))
  ------------------
  |  Branch (1882:7): [True: 36.2k, False: 134k]
  ------------------
 1883|  36.2k|    read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height);
 1884|   171k|}
decodeframe.c:setup_buffer_pool:
 1948|   197k|static inline void setup_buffer_pool(AV1_COMMON *cm) {
 1949|   197k|  BufferPool *const pool = cm->buffer_pool;
 1950|   197k|  const SequenceHeader *const seq_params = cm->seq_params;
 1951|       |
 1952|   197k|  lock_buffer_pool(pool);
 1953|   197k|  if (aom_realloc_frame_buffer(
  ------------------
  |  Branch (1953:7): [True: 101, False: 197k]
  ------------------
 1954|   197k|          &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
 1955|   197k|          seq_params->subsampling_y, seq_params->use_highbitdepth,
 1956|   197k|          AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment,
  ------------------
  |  |   35|   197k|#define AOM_DEC_BORDER_IN_PIXELS 64
  ------------------
 1957|   197k|          &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv,
 1958|   197k|          false, 0)) {
 1959|    101|    unlock_buffer_pool(pool);
 1960|    101|    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
 1961|    101|                       "Failed to allocate frame buffer");
 1962|    101|  }
 1963|   197k|  unlock_buffer_pool(pool);
 1964|       |
 1965|   197k|  cm->cur_frame->buf.bit_depth = (unsigned int)seq_params->bit_depth;
 1966|   197k|  cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
 1967|   197k|  cm->cur_frame->buf.transfer_characteristics =
 1968|   197k|      seq_params->transfer_characteristics;
 1969|   197k|  cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
 1970|   197k|  cm->cur_frame->buf.monochrome = seq_params->monochrome;
 1971|   197k|  cm->cur_frame->buf.chroma_sample_position =
 1972|   197k|      seq_params->chroma_sample_position;
 1973|   197k|  cm->cur_frame->buf.color_range = seq_params->color_range;
 1974|   197k|  cm->cur_frame->buf.render_width = cm->render_width;
 1975|   197k|  cm->cur_frame->buf.render_height = cm->render_height;
 1976|   197k|}
decodeframe.c:setup_frame_size_with_refs:
 2018|  38.8k|                                              struct aom_read_bit_buffer *rb) {
 2019|  38.8k|  int width, height;
 2020|  38.8k|  int found = 0;
 2021|  38.8k|  int has_valid_ref_frame = 0;
 2022|   144k|  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (2022:28): [True: 138k, False: 6.36k]
  ------------------
 2023|   138k|    if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (2023:9): [True: 32.4k, False: 105k]
  ------------------
 2024|  32.4k|      const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
 2025|       |      // This will never be NULL in a normal stream, as streams are required to
 2026|       |      // have a shown keyframe before any inter frames, which would refresh all
 2027|       |      // the reference buffers. However, it might be null if we're starting in
 2028|       |      // the middle of a stream, and static analysis will error if we don't do
 2029|       |      // a null check here.
 2030|  32.4k|      if (ref_buf == NULL) {
  ------------------
  |  Branch (2030:11): [True: 0, False: 32.4k]
  ------------------
 2031|      0|        aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 2032|      0|                           "Invalid condition: invalid reference buffer");
 2033|  32.4k|      } else {
 2034|  32.4k|        const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf;
 2035|  32.4k|        width = buf->y_crop_width;
 2036|  32.4k|        height = buf->y_crop_height;
 2037|  32.4k|        cm->render_width = buf->render_width;
 2038|  32.4k|        cm->render_height = buf->render_height;
 2039|  32.4k|        setup_superres(cm, rb, &width, &height);
 2040|  32.4k|        resize_context_buffers(cm, width, height);
 2041|  32.4k|        found = 1;
 2042|  32.4k|        break;
 2043|  32.4k|      }
 2044|  32.4k|    }
 2045|   138k|  }
 2046|       |
 2047|  38.8k|  const SequenceHeader *const seq_params = cm->seq_params;
 2048|  38.8k|  if (!found) {
  ------------------
  |  Branch (2048:7): [True: 6.36k, False: 32.4k]
  ------------------
 2049|  6.36k|    int num_bits_width = seq_params->num_bits_width;
 2050|  6.36k|    int num_bits_height = seq_params->num_bits_height;
 2051|       |
 2052|  6.36k|    read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
 2053|  6.36k|    setup_superres(cm, rb, &width, &height);
 2054|  6.36k|    resize_context_buffers(cm, width, height);
 2055|  6.36k|    setup_render_size(cm, rb);
 2056|  6.36k|  }
 2057|       |
 2058|  38.8k|  if (width <= 0 || height <= 0)
  ------------------
  |  Branch (2058:7): [True: 0, False: 38.8k]
  |  Branch (2058:21): [True: 0, False: 38.8k]
  ------------------
 2059|      0|    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 2060|      0|                       "Invalid frame size");
 2061|       |
 2062|       |  // Check to make sure at least one of frames that this frame references
 2063|       |  // has valid dimensions.
 2064|   310k|  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (2064:28): [True: 271k, False: 38.8k]
  ------------------
 2065|   271k|    const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
 2066|   271k|    has_valid_ref_frame |=
 2067|   271k|        valid_ref_frame_size(ref_frame->buf.y_crop_width,
 2068|   271k|                             ref_frame->buf.y_crop_height, width, height);
 2069|   271k|  }
 2070|  38.8k|  if (!has_valid_ref_frame)
  ------------------
  |  Branch (2070:7): [True: 2.54k, False: 36.2k]
  ------------------
 2071|  2.54k|    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 2072|  2.54k|                       "Referenced frame has invalid size");
 2073|   279k|  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
  ------------------
  |  Branch (2073:28): [True: 240k, False: 38.8k]
  ------------------
 2074|   240k|    const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
 2075|   240k|    if (!valid_ref_frame_img_fmt(
  ------------------
  |  Branch (2075:9): [True: 3.58k, False: 236k]
  ------------------
 2076|   240k|            ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x,
 2077|   240k|            ref_frame->buf.subsampling_y, seq_params->bit_depth,
 2078|   240k|            seq_params->subsampling_x, seq_params->subsampling_y))
 2079|  3.58k|      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
 2080|  3.58k|                         "Referenced frame has incompatible color format");
 2081|   240k|  }
 2082|  38.8k|  setup_buffer_pool(cm);
 2083|  38.8k|}
decodeframe.c:valid_ref_frame_img_fmt:
 2012|   240k|                                          int this_xss, int this_yss) {
 2013|   240k|  return ref_bit_depth == this_bit_depth && ref_xss == this_xss &&
  ------------------
  |  Branch (2013:10): [True: 236k, False: 3.58k]
  |  Branch (2013:45): [True: 236k, False: 0]
  ------------------
 2014|   240k|         ref_yss == this_yss;
  ------------------
  |  Branch (2014:10): [True: 236k, False: 0]
  ------------------
 2015|   240k|}
decodeframe.c:read_frame_interp_filter:
 1867|  53.0k|static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) {
 1868|  53.0k|  return aom_rb_read_bit(rb) ? SWITCHABLE
  ------------------
  |  Branch (1868:10): [True: 17.8k, False: 35.2k]
  ------------------
 1869|  53.0k|                             : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS);
  ------------------
  |  |   98|  35.2k|#define LOG_SWITCHABLE_FILTERS 2
  ------------------
 1870|  53.0k|}
decodeframe.c:update_ref_frame_id:
 4426|   202k|static inline void update_ref_frame_id(AV1Decoder *const pbi) {
 4427|   202k|  AV1_COMMON *const cm = &pbi->common;
 4428|   202k|  int refresh_frame_flags = cm->current_frame.refresh_frame_flags;
 4429|  1.82M|  for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (4429:19): [True: 1.62M, False: 202k]
  ------------------
 4430|  1.62M|    if ((refresh_frame_flags >> i) & 1) {
  ------------------
  |  Branch (4430:9): [True: 842k, False: 781k]
  ------------------
 4431|   842k|      cm->ref_frame_id[i] = cm->current_frame_id;
 4432|   842k|      pbi->valid_for_referencing[i] = 1;
 4433|   842k|    }
 4434|  1.62M|  }
 4435|   202k|}
decodeframe.c:read_tile_info:
 2182|   190k|                                  struct aom_read_bit_buffer *const rb) {
 2183|   190k|  AV1_COMMON *const cm = &pbi->common;
 2184|       |
 2185|   190k|  read_tile_info_max_tile(cm, rb);
 2186|       |
 2187|   190k|  pbi->context_update_tile_id = 0;
 2188|   190k|  if (cm->tiles.rows * cm->tiles.cols > 1) {
  ------------------
  |  Branch (2188:7): [True: 25.0k, False: 165k]
  ------------------
 2189|       |    // tile to use for cdf update
 2190|  25.0k|    pbi->context_update_tile_id =
 2191|  25.0k|        aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols);
 2192|  25.0k|    if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) {
  ------------------
  |  Branch (2192:9): [True: 1.11k, False: 23.9k]
  ------------------
 2193|  1.11k|      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 2194|  1.11k|                         "Invalid context_update_tile_id");
 2195|  1.11k|    }
 2196|       |    // tile size magnitude
 2197|  25.0k|    pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
 2198|  25.0k|  }
 2199|   190k|}
decodeframe.c:read_tile_info_max_tile:
 2098|   190k|    AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) {
 2099|   190k|  const SequenceHeader *const seq_params = cm->seq_params;
 2100|   190k|  CommonTileParams *const tiles = &cm->tiles;
 2101|   190k|  int width_sb =
 2102|   190k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
  ------------------
  |  |   62|   190k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
 2103|   190k|  int height_sb =
 2104|   190k|      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
  ------------------
  |  |   62|   190k|#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
  ------------------
 2105|       |
 2106|   190k|  av1_get_tile_limits(cm);
 2107|   190k|  tiles->uniform_spacing = aom_rb_read_bit(rb);
 2108|       |
 2109|       |  // Read tile columns
 2110|   190k|  if (tiles->uniform_spacing) {
  ------------------
  |  Branch (2110:7): [True: 100k, False: 90.3k]
  ------------------
 2111|   100k|    tiles->log2_cols = tiles->min_log2_cols;
 2112|   110k|    while (tiles->log2_cols < tiles->max_log2_cols) {
  ------------------
  |  Branch (2112:12): [True: 74.3k, False: 36.1k]
  ------------------
 2113|  74.3k|      if (!aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (2113:11): [True: 63.9k, False: 10.4k]
  ------------------
 2114|  63.9k|        break;
 2115|  63.9k|      }
 2116|  10.4k|      tiles->log2_cols++;
 2117|  10.4k|    }
 2118|   100k|  } else {
 2119|  90.3k|    int i;
 2120|  90.3k|    int start_sb;
 2121|   214k|    for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) {
  ------------------
  |  |   54|   123k|#define MAX_TILE_COLS 64
  ------------------
  |  Branch (2121:31): [True: 123k, False: 90.3k]
  |  Branch (2121:47): [True: 123k, False: 2]
  ------------------
 2122|   123k|      const int size_sb =
 2123|   123k|          1 + rb_read_uniform(rb, AOMMIN(width_sb, tiles->max_width_sb));
  ------------------
  |  |   34|   123k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 115k, False: 8.13k]
  |  |  ------------------
  ------------------
 2124|   123k|      tiles->col_start_sb[i] = start_sb;
 2125|   123k|      start_sb += size_sb;
 2126|   123k|      width_sb -= size_sb;
 2127|   123k|    }
 2128|  90.3k|    tiles->cols = i;
 2129|  90.3k|    tiles->col_start_sb[i] = start_sb + width_sb;
 2130|  90.3k|  }
 2131|   190k|  av1_calculate_tile_cols(seq_params, cm->mi_params.mi_rows,
 2132|   190k|                          cm->mi_params.mi_cols, tiles);
 2133|       |
 2134|       |  // Read tile rows
 2135|   190k|  if (tiles->uniform_spacing) {
  ------------------
  |  Branch (2135:7): [True: 100k, False: 90.3k]
  ------------------
 2136|   100k|    tiles->log2_rows = tiles->min_log2_rows;
 2137|   112k|    while (tiles->log2_rows < tiles->max_log2_rows) {
  ------------------
  |  Branch (2137:12): [True: 74.2k, False: 38.0k]
  ------------------
 2138|  74.2k|      if (!aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (2138:11): [True: 61.9k, False: 12.2k]
  ------------------
 2139|  61.9k|        break;
 2140|  61.9k|      }
 2141|  12.2k|      tiles->log2_rows++;
 2142|  12.2k|    }
 2143|   100k|  } else {
 2144|  90.3k|    int i;
 2145|  90.3k|    int start_sb;
 2146|   220k|    for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) {
  ------------------
  |  |   53|   130k|#define MAX_TILE_ROWS 64
  ------------------
  |  Branch (2146:31): [True: 130k, False: 90.3k]
  |  Branch (2146:48): [True: 130k, False: 1]
  ------------------
 2147|   130k|      const int size_sb =
 2148|   130k|          1 + rb_read_uniform(rb, AOMMIN(height_sb, tiles->max_height_sb));
  ------------------
  |  |   34|   130k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 37.5k, False: 92.8k]
  |  |  ------------------
  ------------------
 2149|   130k|      tiles->row_start_sb[i] = start_sb;
 2150|   130k|      start_sb += size_sb;
 2151|   130k|      height_sb -= size_sb;
 2152|   130k|    }
 2153|  90.3k|    tiles->rows = i;
 2154|  90.3k|    tiles->row_start_sb[i] = start_sb + height_sb;
 2155|  90.3k|  }
 2156|   190k|  av1_calculate_tile_rows(seq_params, cm->mi_params.mi_rows, tiles);
 2157|   190k|}
decodeframe.c:rb_read_uniform:
 2086|   254k|static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
 2087|   254k|  const int l = get_unsigned_bits(n);
 2088|   254k|  const int m = (1 << l) - n;
 2089|   254k|  const int v = aom_rb_read_literal(rb, l - 1);
 2090|   254k|  assert(l != 0);
 2091|   254k|  if (v < m)
  ------------------
  |  Branch (2091:7): [True: 230k, False: 23.2k]
  ------------------
 2092|   230k|    return v;
 2093|  23.2k|  else
 2094|  23.2k|    return (v << 1) - m + aom_rb_read_bit(rb);
 2095|   254k|}
decodeframe.c:setup_quantization:
 1778|   188k|                                      struct aom_read_bit_buffer *rb) {
 1779|   188k|  quant_params->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
  ------------------
  |  |   28|   188k|#define QINDEX_BITS 8
  ------------------
 1780|   188k|  quant_params->y_dc_delta_q = read_delta_q(rb);
 1781|   188k|  if (num_planes > 1) {
  ------------------
  |  Branch (1781:7): [True: 153k, False: 35.8k]
  ------------------
 1782|   153k|    int diff_uv_delta = 0;
 1783|   153k|    if (separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
  ------------------
  |  Branch (1783:9): [True: 6.92k, False: 146k]
  ------------------
 1784|   153k|    quant_params->u_dc_delta_q = read_delta_q(rb);
 1785|   153k|    quant_params->u_ac_delta_q = read_delta_q(rb);
 1786|   153k|    if (diff_uv_delta) {
  ------------------
  |  Branch (1786:9): [True: 2.51k, False: 150k]
  ------------------
 1787|  2.51k|      quant_params->v_dc_delta_q = read_delta_q(rb);
 1788|  2.51k|      quant_params->v_ac_delta_q = read_delta_q(rb);
 1789|   150k|    } else {
 1790|   150k|      quant_params->v_dc_delta_q = quant_params->u_dc_delta_q;
 1791|   150k|      quant_params->v_ac_delta_q = quant_params->u_ac_delta_q;
 1792|   150k|    }
 1793|   153k|  } else {
 1794|  35.8k|    quant_params->u_dc_delta_q = 0;
 1795|  35.8k|    quant_params->u_ac_delta_q = 0;
 1796|  35.8k|    quant_params->v_dc_delta_q = 0;
 1797|  35.8k|    quant_params->v_ac_delta_q = 0;
 1798|  35.8k|  }
 1799|   188k|  quant_params->using_qmatrix = aom_rb_read_bit(rb);
 1800|   188k|  if (quant_params->using_qmatrix) {
  ------------------
  |  Branch (1800:7): [True: 40.1k, False: 148k]
  ------------------
 1801|  40.1k|    quant_params->qmatrix_level_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
  ------------------
  |  |   30|  40.1k|#define QM_LEVEL_BITS 4
  ------------------
 1802|  40.1k|    quant_params->qmatrix_level_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
  ------------------
  |  |   30|  40.1k|#define QM_LEVEL_BITS 4
  ------------------
 1803|  40.1k|    if (!separate_uv_delta_q)
  ------------------
  |  Branch (1803:9): [True: 38.5k, False: 1.59k]
  ------------------
 1804|  38.5k|      quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;
 1805|  1.59k|    else
 1806|  1.59k|      quant_params->qmatrix_level_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
  ------------------
  |  |   30|  1.59k|#define QM_LEVEL_BITS 4
  ------------------
 1807|   148k|  } else {
 1808|   148k|    quant_params->qmatrix_level_y = 0;
 1809|   148k|    quant_params->qmatrix_level_u = 0;
 1810|   148k|    quant_params->qmatrix_level_v = 0;
 1811|   148k|  }
 1812|   188k|}
decodeframe.c:read_delta_q:
 1772|   500k|static inline int read_delta_q(struct aom_read_bit_buffer *rb) {
 1773|   500k|  return aom_rb_read_bit(rb) ? aom_rb_read_inv_signed_literal(rb, 6) : 0;
  ------------------
  |  Branch (1773:10): [True: 149k, False: 350k]
  ------------------
 1774|   500k|}
decodeframe.c:setup_segmentation:
 1432|   188k|                                      struct aom_read_bit_buffer *rb) {
 1433|   188k|  struct segmentation *const seg = &cm->seg;
 1434|       |
 1435|   188k|  seg->update_map = 0;
 1436|   188k|  seg->update_data = 0;
 1437|   188k|  seg->temporal_update = 0;
 1438|       |
 1439|   188k|  seg->enabled = aom_rb_read_bit(rb);
 1440|   188k|  if (!seg->enabled) {
  ------------------
  |  Branch (1440:7): [True: 163k, False: 24.9k]
  ------------------
 1441|   163k|    if (cm->cur_frame->seg_map) {
  ------------------
  |  Branch (1441:9): [True: 163k, False: 0]
  ------------------
 1442|   163k|      memset(cm->cur_frame->seg_map, 0,
 1443|   163k|             (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
 1444|   163k|    }
 1445|       |
 1446|   163k|    memset(seg, 0, sizeof(*seg));
 1447|   163k|    segfeatures_copy(&cm->cur_frame->seg, seg);
 1448|   163k|    return;
 1449|   163k|  }
 1450|  24.9k|  if (cm->seg.enabled && cm->prev_frame &&
  ------------------
  |  Branch (1450:7): [True: 24.9k, False: 7]
  |  Branch (1450:26): [True: 9.37k, False: 15.5k]
  ------------------
 1451|  24.9k|      (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) &&
  ------------------
  |  Branch (1451:7): [True: 7.65k, False: 1.71k]
  ------------------
 1452|  24.9k|      (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) {
  ------------------
  |  Branch (1452:7): [True: 7.62k, False: 31]
  ------------------
 1453|  7.62k|    cm->last_frame_seg_map = cm->prev_frame->seg_map;
 1454|  17.3k|  } else {
 1455|  17.3k|    cm->last_frame_seg_map = NULL;
 1456|  17.3k|  }
 1457|       |  // Read update flags
 1458|  24.9k|  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
  ------------------
  |  |   66|  24.9k|#define PRIMARY_REF_NONE 7
  ------------------
  |  Branch (1458:7): [True: 15.5k, False: 9.38k]
  ------------------
 1459|       |    // These frames can't use previous frames, so must signal map + features
 1460|  15.5k|    seg->update_map = 1;
 1461|  15.5k|    seg->temporal_update = 0;
 1462|  15.5k|    seg->update_data = 1;
 1463|  15.5k|  } else {
 1464|  9.38k|    seg->update_map = aom_rb_read_bit(rb);
 1465|  9.38k|    if (seg->update_map) {
  ------------------
  |  Branch (1465:9): [True: 5.91k, False: 3.46k]
  ------------------
 1466|  5.91k|      seg->temporal_update = aom_rb_read_bit(rb);
 1467|  5.91k|    } else {
 1468|  3.46k|      seg->temporal_update = 0;
 1469|  3.46k|    }
 1470|  9.38k|    seg->update_data = aom_rb_read_bit(rb);
 1471|  9.38k|  }
 1472|       |
 1473|       |  // Segmentation data update
 1474|  24.9k|  if (seg->update_data) {
  ------------------
  |  Branch (1474:7): [True: 18.4k, False: 6.52k]
  ------------------
 1475|  18.4k|    av1_clearall_segfeatures(seg);
 1476|       |
 1477|   161k|    for (int i = 0; i < MAX_SEGMENTS; i++) {
  ------------------
  |  |   21|   161k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (1477:21): [True: 142k, False: 18.4k]
  ------------------
 1478|  1.27M|      for (int j = 0; j < SEG_LVL_MAX; j++) {
  ------------------
  |  Branch (1478:23): [True: 1.13M, False: 142k]
  ------------------
 1479|  1.13M|        int data = 0;
 1480|  1.13M|        const int feature_enabled = aom_rb_read_bit(rb);
 1481|  1.13M|        if (feature_enabled) {
  ------------------
  |  Branch (1481:13): [True: 278k, False: 856k]
  ------------------
 1482|   278k|          av1_enable_segfeature(seg, i, j);
 1483|       |
 1484|   278k|          const int data_max = av1_seg_feature_data_max(j);
 1485|   278k|          const int data_min = -data_max;
 1486|   278k|          const int ubits = get_unsigned_bits(data_max);
 1487|       |
 1488|   278k|          if (av1_is_segfeature_signed(j)) {
  ------------------
  |  Branch (1488:15): [True: 167k, False: 111k]
  ------------------
 1489|   167k|            data = aom_rb_read_inv_signed_literal(rb, ubits);
 1490|   167k|          } else {
 1491|   111k|            data = aom_rb_read_literal(rb, ubits);
 1492|   111k|          }
 1493|       |
 1494|   278k|          data = clamp(data, data_min, data_max);
 1495|   278k|        }
 1496|  1.13M|        av1_set_segdata(seg, i, j, data);
 1497|  1.13M|      }
 1498|   142k|    }
 1499|  18.4k|    av1_calculate_segdata(seg);
 1500|  18.4k|  } else if (cm->prev_frame) {
  ------------------
  |  Branch (1500:14): [True: 6.51k, False: 8]
  ------------------
 1501|  6.51k|    segfeatures_copy(seg, &cm->prev_frame->seg);
 1502|  6.51k|  }
 1503|  24.9k|  segfeatures_copy(&cm->cur_frame->seg, seg);
 1504|  24.9k|}
decodeframe.c:setup_segmentation_dequant:
 1824|   186k|                                              MACROBLOCKD *const xd) {
 1825|   186k|  const int bit_depth = cm->seq_params->bit_depth;
 1826|       |  // When segmentation is disabled, only the first value is used.  The
 1827|       |  // remaining are don't cares.
 1828|   186k|  const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1;
  ------------------
  |  |   21|  23.0k|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (1828:28): [True: 23.0k, False: 163k]
  ------------------
 1829|   186k|  CommonQuantParams *const quant_params = &cm->quant_params;
 1830|   533k|  for (int i = 0; i < max_segments; ++i) {
  ------------------
  |  Branch (1830:19): [True: 347k, False: 186k]
  ------------------
 1831|   347k|    const int qindex = xd->qindex[i];
 1832|   347k|    quant_params->y_dequant_QTX[i][0] =
 1833|   347k|        av1_dc_quant_QTX(qindex, quant_params->y_dc_delta_q, bit_depth);
 1834|   347k|    quant_params->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth);
 1835|   347k|    quant_params->u_dequant_QTX[i][0] =
 1836|   347k|        av1_dc_quant_QTX(qindex, quant_params->u_dc_delta_q, bit_depth);
 1837|   347k|    quant_params->u_dequant_QTX[i][1] =
 1838|   347k|        av1_ac_quant_QTX(qindex, quant_params->u_ac_delta_q, bit_depth);
 1839|   347k|    quant_params->v_dequant_QTX[i][0] =
 1840|   347k|        av1_dc_quant_QTX(qindex, quant_params->v_dc_delta_q, bit_depth);
 1841|   347k|    quant_params->v_dequant_QTX[i][1] =
 1842|   347k|        av1_ac_quant_QTX(qindex, quant_params->v_ac_delta_q, bit_depth);
 1843|   347k|    const int use_qmatrix = av1_use_qmatrix(quant_params, xd, i);
 1844|       |    // NB: depends on base index so there is only 1 set per frame
 1845|       |    // No quant weighting when lossless or signalled not using QM
 1846|   347k|    const int qmlevel_y =
 1847|   347k|        use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
  ------------------
  |  |   31|   257k|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|   257k|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (1847:9): [True: 90.0k, False: 257k]
  ------------------
 1848|  6.95M|    for (int j = 0; j < TX_SIZES_ALL; ++j) {
  ------------------
  |  Branch (1848:21): [True: 6.60M, False: 347k]
  ------------------
 1849|  6.60M|      quant_params->y_iqmatrix[i][j] =
 1850|  6.60M|          get_iqmatrix(quant_params, qmlevel_y, AOM_PLANE_Y, j);
  ------------------
  |  |  226|  6.60M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
 1851|  6.60M|    }
 1852|   347k|    const int qmlevel_u =
 1853|   347k|        use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
  ------------------
  |  |   31|   257k|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|   257k|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (1853:9): [True: 90.0k, False: 257k]
  ------------------
 1854|  6.95M|    for (int j = 0; j < TX_SIZES_ALL; ++j) {
  ------------------
  |  Branch (1854:21): [True: 6.60M, False: 347k]
  ------------------
 1855|  6.60M|      quant_params->u_iqmatrix[i][j] =
 1856|  6.60M|          get_iqmatrix(quant_params, qmlevel_u, AOM_PLANE_U, j);
  ------------------
  |  |  227|  6.60M|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
 1857|  6.60M|    }
 1858|   347k|    const int qmlevel_v =
 1859|   347k|        use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
  ------------------
  |  |   31|   257k|#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
  |  |  ------------------
  |  |  |  |   30|   257k|#define QM_LEVEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (1859:9): [True: 90.0k, False: 257k]
  ------------------
 1860|  6.95M|    for (int j = 0; j < TX_SIZES_ALL; ++j) {
  ------------------
  |  Branch (1860:21): [True: 6.60M, False: 347k]
  ------------------
 1861|  6.60M|      quant_params->v_iqmatrix[i][j] =
 1862|  6.60M|          get_iqmatrix(quant_params, qmlevel_v, AOM_PLANE_V, j);
  ------------------
  |  |  228|  6.60M|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
 1863|  6.60M|    }
 1864|   347k|  }
 1865|   186k|}
decodeframe.c:get_iqmatrix:
 1816|  19.8M|                                    int qmlevel, int plane, TX_SIZE tx_size) {
 1817|  19.8M|  assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL ||
 1818|  19.8M|         qmlevel == NUM_QM_LEVELS - 1);
 1819|  19.8M|  return quant_params->giqmatrix[qmlevel][plane][tx_size];
 1820|  19.8M|}
decodeframe.c:setup_loopfilter:
 1705|   186k|                                    struct aom_read_bit_buffer *rb) {
 1706|   186k|  const int num_planes = av1_num_planes(cm);
 1707|   186k|  struct loopfilter *lf = &cm->lf;
 1708|       |
 1709|   186k|  if (cm->features.allow_intrabc || cm->features.coded_lossless) {
  ------------------
  |  Branch (1709:7): [True: 44.0k, False: 142k]
  |  Branch (1709:37): [True: 35.1k, False: 107k]
  ------------------
 1710|       |    // write default deltas to frame buffer
 1711|  79.2k|    av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
 1712|  79.2k|    av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
 1713|  79.2k|    return;
 1714|  79.2k|  }
 1715|   107k|  assert(!cm->features.coded_lossless);
 1716|   107k|  if (cm->prev_frame) {
  ------------------
  |  Branch (1716:7): [True: 36.1k, False: 71.0k]
  ------------------
 1717|       |    // write deltas to frame buffer
 1718|  36.1k|    memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
 1719|  36.1k|    memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
  ------------------
  |  |   74|  36.1k|#define MAX_MODE_LF_DELTAS 2
  ------------------
 1720|  71.0k|  } else {
 1721|  71.0k|    av1_set_default_ref_deltas(lf->ref_deltas);
 1722|  71.0k|    av1_set_default_mode_deltas(lf->mode_deltas);
 1723|  71.0k|  }
 1724|   107k|  lf->filter_level[0] = aom_rb_read_literal(rb, 6);
 1725|   107k|  lf->filter_level[1] = aom_rb_read_literal(rb, 6);
 1726|   107k|  if (num_planes > 1) {
  ------------------
  |  Branch (1726:7): [True: 89.6k, False: 17.4k]
  ------------------
 1727|  89.6k|    if (lf->filter_level[0] || lf->filter_level[1]) {
  ------------------
  |  Branch (1727:9): [True: 39.4k, False: 50.1k]
  |  Branch (1727:32): [True: 15.1k, False: 35.0k]
  ------------------
 1728|  54.6k|      lf->filter_level_u = aom_rb_read_literal(rb, 6);
 1729|  54.6k|      lf->filter_level_v = aom_rb_read_literal(rb, 6);
 1730|  54.6k|    }
 1731|  89.6k|  }
 1732|   107k|  lf->sharpness_level = aom_rb_read_literal(rb, 3);
 1733|       |
 1734|       |  // Read in loop filter deltas applied at the MB level based on mode or ref
 1735|       |  // frame.
 1736|   107k|  lf->mode_ref_delta_update = 0;
 1737|       |
 1738|   107k|  lf->mode_ref_delta_enabled = aom_rb_read_bit(rb);
 1739|   107k|  if (lf->mode_ref_delta_enabled) {
  ------------------
  |  Branch (1739:7): [True: 31.9k, False: 75.2k]
  ------------------
 1740|  31.9k|    lf->mode_ref_delta_update = aom_rb_read_bit(rb);
 1741|  31.9k|    if (lf->mode_ref_delta_update) {
  ------------------
  |  Branch (1741:9): [True: 15.3k, False: 16.5k]
  ------------------
 1742|   137k|      for (int i = 0; i < REF_FRAMES; i++)
  ------------------
  |  Branch (1742:23): [True: 122k, False: 15.3k]
  ------------------
 1743|   122k|        if (aom_rb_read_bit(rb))
  ------------------
  |  Branch (1743:13): [True: 48.9k, False: 73.6k]
  ------------------
 1744|  48.9k|          lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
 1745|       |
 1746|  44.5k|      for (int i = 0; i < MAX_MODE_LF_DELTAS; i++)
  ------------------
  |  |   74|  44.5k|#define MAX_MODE_LF_DELTAS 2
  ------------------
  |  Branch (1746:23): [True: 29.1k, False: 15.3k]
  ------------------
 1747|  29.1k|        if (aom_rb_read_bit(rb))
  ------------------
  |  Branch (1747:13): [True: 4.96k, False: 24.2k]
  ------------------
 1748|  4.96k|          lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
 1749|  15.3k|    }
 1750|  31.9k|  }
 1751|       |
 1752|       |  // write deltas to frame buffer
 1753|   107k|  memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES);
 1754|   107k|  memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
  ------------------
  |  |   74|   107k|#define MAX_MODE_LF_DELTAS 2
  ------------------
 1755|   107k|}
decodeframe.c:setup_cdef:
 1757|  71.3k|static inline void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
 1758|  71.3k|  const int num_planes = av1_num_planes(cm);
 1759|  71.3k|  CdefInfo *const cdef_info = &cm->cdef_info;
 1760|       |
 1761|  71.3k|  if (cm->features.allow_intrabc) return;
  ------------------
  |  Branch (1761:7): [True: 8.65k, False: 62.7k]
  ------------------
 1762|  62.7k|  cdef_info->cdef_damping = aom_rb_read_literal(rb, 2) + 3;
 1763|  62.7k|  cdef_info->cdef_bits = aom_rb_read_literal(rb, 2);
 1764|  62.7k|  cdef_info->nb_cdef_strengths = 1 << cdef_info->cdef_bits;
 1765|   169k|  for (int i = 0; i < cdef_info->nb_cdef_strengths; i++) {
  ------------------
  |  Branch (1765:19): [True: 106k, False: 62.7k]
  ------------------
 1766|   106k|    cdef_info->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
  ------------------
  |  |   14|   106k|#define CDEF_STRENGTH_BITS 6
  ------------------
 1767|   106k|    cdef_info->cdef_uv_strengths[i] =
 1768|   106k|        num_planes > 1 ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) : 0;
  ------------------
  |  |   14|  89.3k|#define CDEF_STRENGTH_BITS 6
  ------------------
  |  Branch (1768:9): [True: 89.3k, False: 17.5k]
  ------------------
 1769|   106k|  }
 1770|  62.7k|}
decodeframe.c:decode_restoration_mode:
 1507|  81.8k|                                           struct aom_read_bit_buffer *rb) {
 1508|  81.8k|  assert(!cm->features.all_lossless);
 1509|  81.8k|  const int num_planes = av1_num_planes(cm);
 1510|  81.8k|  if (cm->features.allow_intrabc) return;
  ------------------
  |  Branch (1510:7): [True: 9.41k, False: 72.4k]
  ------------------
 1511|  72.4k|  int all_none = 1, chroma_none = 1;
 1512|   272k|  for (int p = 0; p < num_planes; ++p) {
  ------------------
  |  Branch (1512:19): [True: 200k, False: 72.4k]
  ------------------
 1513|   200k|    RestorationInfo *rsi = &cm->rst_info[p];
 1514|   200k|    if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (1514:9): [True: 50.4k, False: 149k]
  ------------------
 1515|  50.4k|      rsi->frame_restoration_type =
 1516|  50.4k|          aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER;
  ------------------
  |  Branch (1516:11): [True: 37.7k, False: 12.7k]
  ------------------
 1517|   149k|    } else {
 1518|   149k|      rsi->frame_restoration_type =
 1519|   149k|          aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE;
  ------------------
  |  Branch (1519:11): [True: 14.9k, False: 135k]
  ------------------
 1520|   149k|    }
 1521|   200k|    if (rsi->frame_restoration_type != RESTORE_NONE) {
  ------------------
  |  Branch (1521:9): [True: 65.3k, False: 135k]
  ------------------
 1522|  65.3k|      all_none = 0;
 1523|  65.3k|      chroma_none &= p == 0;
 1524|  65.3k|    }
 1525|   200k|  }
 1526|  72.4k|  if (!all_none) {
  ------------------
  |  Branch (1526:7): [True: 32.9k, False: 39.5k]
  ------------------
 1527|  32.9k|    assert(cm->seq_params->sb_size == BLOCK_64X64 ||
 1528|  32.9k|           cm->seq_params->sb_size == BLOCK_128X128);
 1529|  32.9k|    const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
  ------------------
  |  Branch (1529:25): [True: 27.2k, False: 5.64k]
  ------------------
 1530|       |
 1531|   121k|    for (int p = 0; p < num_planes; ++p)
  ------------------
  |  Branch (1531:21): [True: 88.7k, False: 32.9k]
  ------------------
 1532|  88.7k|      cm->rst_info[p].restoration_unit_size = sb_size;
 1533|       |
 1534|  32.9k|    RestorationInfo *rsi = &cm->rst_info[0];
 1535|       |
 1536|  32.9k|    if (sb_size == 64) {
  ------------------
  |  Branch (1536:9): [True: 5.64k, False: 27.2k]
  ------------------
 1537|  5.64k|      rsi->restoration_unit_size <<= aom_rb_read_bit(rb);
 1538|  5.64k|    }
 1539|  32.9k|    if (rsi->restoration_unit_size > 64) {
  ------------------
  |  Branch (1539:9): [True: 29.4k, False: 3.44k]
  ------------------
 1540|  29.4k|      rsi->restoration_unit_size <<= aom_rb_read_bit(rb);
 1541|  29.4k|    }
 1542|  39.5k|  } else {
 1543|  39.5k|    const int size = RESTORATION_UNITSIZE_MAX;
  ------------------
  |  |   80|  39.5k|#define RESTORATION_UNITSIZE_MAX 256
  ------------------
 1544|   151k|    for (int p = 0; p < num_planes; ++p)
  ------------------
  |  Branch (1544:21): [True: 111k, False: 39.5k]
  ------------------
 1545|   111k|      cm->rst_info[p].restoration_unit_size = size;
 1546|  39.5k|  }
 1547|       |
 1548|  72.4k|  if (num_planes > 1) {
  ------------------
  |  Branch (1548:7): [True: 63.7k, False: 8.70k]
  ------------------
 1549|  63.7k|    int s =
 1550|  63.7k|        AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
  ------------------
  |  |   34|  63.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 0, False: 63.7k]
  |  |  ------------------
  ------------------
 1551|  63.7k|    if (s && !chroma_none) {
  ------------------
  |  Branch (1551:9): [True: 48.3k, False: 15.4k]
  |  Branch (1551:14): [True: 18.6k, False: 29.6k]
  ------------------
 1552|  18.6k|      cm->rst_info[1].restoration_unit_size =
 1553|  18.6k|          cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s);
 1554|  45.0k|    } else {
 1555|  45.0k|      cm->rst_info[1].restoration_unit_size =
 1556|  45.0k|          cm->rst_info[0].restoration_unit_size;
 1557|  45.0k|    }
 1558|  63.7k|    cm->rst_info[2].restoration_unit_size =
 1559|  63.7k|        cm->rst_info[1].restoration_unit_size;
 1560|  63.7k|  }
 1561|  72.4k|}
decodeframe.c:read_tx_mode:
  140|   181k|                            int coded_lossless) {
  141|   181k|  if (coded_lossless) return ONLY_4X4;
  ------------------
  |  Branch (141:7): [True: 35.8k, False: 146k]
  ------------------
  142|   146k|  return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST;
  ------------------
  |  Branch (142:10): [True: 50.8k, False: 95.2k]
  ------------------
  143|   181k|}
decodeframe.c:read_frame_reference_mode:
  146|   181k|    const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
  147|   181k|  if (frame_is_intra_only(cm)) {
  ------------------
  |  Branch (147:7): [True: 135k, False: 46.0k]
  ------------------
  148|   135k|    return SINGLE_REFERENCE;
  149|   135k|  } else {
  150|  46.0k|    return aom_rb_read_bit(rb) ? REFERENCE_MODE_SELECT : SINGLE_REFERENCE;
  ------------------
  |  Branch (150:12): [True: 26.2k, False: 19.8k]
  ------------------
  151|  46.0k|  }
  152|   181k|}
decodeframe.c:read_global_motion:
 4371|  46.0k|                                      struct aom_read_bit_buffer *rb) {
 4372|   364k|  for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
  ------------------
  |  Branch (4372:32): [True: 318k, False: 46.0k]
  ------------------
 4373|   318k|    const WarpedMotionParams *ref_params =
 4374|   318k|        cm->prev_frame ? &cm->prev_frame->global_motion[frame]
  ------------------
  |  Branch (4374:9): [True: 264k, False: 53.2k]
  ------------------
 4375|   318k|                       : &default_warp_params;
 4376|   318k|    int good_params =
 4377|   318k|        read_global_motion_params(&cm->global_motion[frame], ref_params, rb,
 4378|   318k|                                  cm->features.allow_high_precision_mv);
 4379|   318k|    if (!good_params) {
  ------------------
  |  Branch (4379:9): [True: 9.88k, False: 308k]
  ------------------
 4380|       |#if WARPED_MOTION_DEBUG
 4381|       |      printf("Warning: unexpected global motion shear params from aomenc\n");
 4382|       |#endif
 4383|  9.88k|      cm->global_motion[frame].invalid = 1;
 4384|  9.88k|    }
 4385|       |
 4386|       |    // TODO(sarahparker, debargha): The logic in the commented out code below
 4387|       |    // does not work currently and causes mismatches when resize is on. Fix it
 4388|       |    // before turning the optimization back on.
 4389|       |    /*
 4390|       |    YV12_BUFFER_CONFIG *ref_buf = get_ref_frame(cm, frame);
 4391|       |    if (cm->width == ref_buf->y_crop_width &&
 4392|       |        cm->height == ref_buf->y_crop_height) {
 4393|       |      read_global_motion_params(&cm->global_motion[frame],
 4394|       |                                &cm->prev_frame->global_motion[frame], rb,
 4395|       |                                cm->features.allow_high_precision_mv);
 4396|       |    } else {
 4397|       |      cm->global_motion[frame] = default_warp_params;
 4398|       |    }
 4399|       |    */
 4400|       |    /*
 4401|       |    printf("Dec Ref %d [%d/%d]: %d %d %d %d\n",
 4402|       |           frame, cm->current_frame.frame_number, cm->show_frame,
 4403|       |           cm->global_motion[frame].wmmat[0],
 4404|       |           cm->global_motion[frame].wmmat[1],
 4405|       |           cm->global_motion[frame].wmmat[2],
 4406|       |           cm->global_motion[frame].wmmat[3]);
 4407|       |           */
 4408|   318k|  }
 4409|  46.0k|  memcpy(cm->cur_frame->global_motion, cm->global_motion,
 4410|  46.0k|         REF_FRAMES * sizeof(WarpedMotionParams));
 4411|  46.0k|}
decodeframe.c:read_global_motion_params:
 4303|   318k|                                     int allow_hp) {
 4304|   318k|  TransformationType type = aom_rb_read_bit(rb);
 4305|   318k|  if (type != IDENTITY) {
  ------------------
  |  Branch (4305:7): [True: 27.8k, False: 290k]
  ------------------
 4306|  27.8k|    if (aom_rb_read_bit(rb))
  ------------------
  |  Branch (4306:9): [True: 18.4k, False: 9.41k]
  ------------------
 4307|  18.4k|      type = ROTZOOM;
 4308|  9.41k|    else
 4309|  9.41k|      type = aom_rb_read_bit(rb) ? TRANSLATION : AFFINE;
  ------------------
  |  Branch (4309:14): [True: 3.27k, False: 6.14k]
  ------------------
 4310|  27.8k|  }
 4311|       |
 4312|   318k|  *params = default_warp_params;
 4313|   318k|  params->wmtype = type;
 4314|       |
 4315|   318k|  if (type >= ROTZOOM) {
  ------------------
  |  Branch (4315:7): [True: 24.5k, False: 293k]
  ------------------
 4316|  24.5k|    params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin(
 4317|  24.5k|                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  178|  24.5k|#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
  |  |  ------------------
  |  |  |  |  173|  24.5k|#define GM_ABS_ALPHA_BITS 12
  |  |  ------------------
  ------------------
                                         rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  24.5k|#define SUBEXPFIN_K 3
  ------------------
 4318|  24.5k|                           (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
  ------------------
  |  |  174|  24.5k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  24.5k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |  172|  24.5k|#define GM_ALPHA_PREC_BITS 15
  |  |  ------------------
  ------------------
 4319|  24.5k|                               (1 << GM_ALPHA_PREC_BITS)) *
  ------------------
  |  |  172|  24.5k|#define GM_ALPHA_PREC_BITS 15
  ------------------
 4320|  24.5k|                           GM_ALPHA_DECODE_FACTOR +
  ------------------
  |  |  175|  24.5k|#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  174|  24.5k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|  24.5k|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  172|  24.5k|#define GM_ALPHA_PREC_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4321|  24.5k|                       (1 << WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  24.5k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 4322|  24.5k|    params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin(
 4323|  24.5k|                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  178|  24.5k|#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
  |  |  ------------------
  |  |  |  |  173|  24.5k|#define GM_ABS_ALPHA_BITS 12
  |  |  ------------------
  ------------------
                                         rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  24.5k|#define SUBEXPFIN_K 3
  ------------------
 4324|  24.5k|                           (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) *
  ------------------
  |  |  174|  24.5k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  24.5k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |  172|  24.5k|#define GM_ALPHA_PREC_BITS 15
  |  |  ------------------
  ------------------
 4325|  24.5k|                       GM_ALPHA_DECODE_FACTOR;
  ------------------
  |  |  175|  24.5k|#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  174|  24.5k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|  24.5k|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  172|  24.5k|#define GM_ALPHA_PREC_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4326|  24.5k|  }
 4327|       |
 4328|   318k|  if (type >= AFFINE) {
  ------------------
  |  Branch (4328:7): [True: 6.02k, False: 312k]
  ------------------
 4329|  6.02k|    params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin(
 4330|  6.02k|                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  178|  6.02k|#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
  |  |  ------------------
  |  |  |  |  173|  6.02k|#define GM_ABS_ALPHA_BITS 12
  |  |  ------------------
  ------------------
                                         rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  6.02k|#define SUBEXPFIN_K 3
  ------------------
 4331|  6.02k|                           (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) *
  ------------------
  |  |  174|  6.02k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  6.02k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |  172|  6.02k|#define GM_ALPHA_PREC_BITS 15
  |  |  ------------------
  ------------------
 4332|  6.02k|                       GM_ALPHA_DECODE_FACTOR;
  ------------------
  |  |  175|  6.02k|#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  174|  6.02k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|  6.02k|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  172|  6.02k|#define GM_ALPHA_PREC_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4333|  6.02k|    params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin(
 4334|  6.02k|                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  178|  6.02k|#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
  |  |  ------------------
  |  |  |  |  173|  6.02k|#define GM_ABS_ALPHA_BITS 12
  |  |  ------------------
  ------------------
                                         rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  6.02k|#define SUBEXPFIN_K 3
  ------------------
 4335|  6.02k|                           (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
  ------------------
  |  |  174|  6.02k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  6.02k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  ------------------
  |  |  |  |  172|  6.02k|#define GM_ALPHA_PREC_BITS 15
  |  |  ------------------
  ------------------
 4336|  6.02k|                               (1 << GM_ALPHA_PREC_BITS)) *
  ------------------
  |  |  172|  6.02k|#define GM_ALPHA_PREC_BITS 15
  ------------------
 4337|  6.02k|                           GM_ALPHA_DECODE_FACTOR +
  ------------------
  |  |  175|  6.02k|#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  174|  6.02k|#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|  6.02k|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  172|  6.02k|#define GM_ALPHA_PREC_BITS 15
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4338|  6.02k|                       (1 << WARPEDMODEL_PREC_BITS);
  ------------------
  |  |   96|  6.02k|#define WARPEDMODEL_PREC_BITS 16
  ------------------
 4339|   312k|  } else {
 4340|   312k|    params->wmmat[4] = -params->wmmat[3];
 4341|   312k|    params->wmmat[5] = params->wmmat[2];
 4342|   312k|  }
 4343|       |
 4344|   318k|  if (type >= TRANSLATION) {
  ------------------
  |  Branch (4344:7): [True: 27.6k, False: 290k]
  ------------------
 4345|  27.6k|    const int trans_bits = (type == TRANSLATION)
  ------------------
  |  Branch (4345:28): [True: 3.27k, False: 24.3k]
  ------------------
 4346|  27.6k|                               ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
  ------------------
  |  |  166|  3.27k|#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3)
  |  |  ------------------
  |  |  |  |  165|  3.27k|#define GM_ABS_TRANS_BITS 12
  |  |  ------------------
  |  |               #define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3)
  |  |  ------------------
  |  |  |  |  164|  3.27k|#define GM_TRANS_PREC_BITS 6
  |  |  ------------------
  ------------------
 4347|  27.6k|                               : GM_ABS_TRANS_BITS;
  ------------------
  |  |  165|  24.3k|#define GM_ABS_TRANS_BITS 12
  ------------------
 4348|  27.6k|    const int trans_dec_factor =
 4349|  27.6k|        (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp)
  ------------------
  |  |  170|  3.27k|#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  168|  3.27k|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|  3.27k|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (4349:9): [True: 3.27k, False: 24.3k]
  ------------------
 4350|  27.6k|                              : GM_TRANS_DECODE_FACTOR;
  ------------------
  |  |  169|  24.3k|#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF)
  |  |  ------------------
  |  |  |  |  167|  24.3k|#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   96|  24.3k|#define WARPEDMODEL_PREC_BITS 16
  |  |  |  |  ------------------
  |  |  |  |               #define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |  164|  24.3k|#define GM_TRANS_PREC_BITS 6
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 4351|  27.6k|    const int trans_prec_diff = (type == TRANSLATION)
  ------------------
  |  Branch (4351:33): [True: 3.27k, False: 24.3k]
  ------------------
 4352|  27.6k|                                    ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
  ------------------
  |  |  168|  3.27k|#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
  |  |  ------------------
  |  |  |  |   96|  3.27k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  ------------------
 4353|  27.6k|                                    : GM_TRANS_PREC_DIFF;
  ------------------
  |  |  167|  24.3k|#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
  |  |  ------------------
  |  |  |  |   96|  24.3k|#define WARPEDMODEL_PREC_BITS 16
  |  |  ------------------
  |  |               #define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
  |  |  ------------------
  |  |  |  |  164|  24.3k|#define GM_TRANS_PREC_BITS 6
  |  |  ------------------
  ------------------
 4354|  27.6k|    params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin(
 4355|  27.6k|                           rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  27.6k|#define SUBEXPFIN_K 3
  ------------------
 4356|  27.6k|                           (ref_params->wmmat[0] >> trans_prec_diff)) *
 4357|  27.6k|                       trans_dec_factor;
 4358|  27.6k|    params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin(
 4359|  27.6k|                           rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
  ------------------
  |  |  163|  27.6k|#define SUBEXPFIN_K 3
  ------------------
 4360|  27.6k|                           (ref_params->wmmat[1] >> trans_prec_diff)) *
 4361|  27.6k|                       trans_dec_factor;
 4362|  27.6k|  }
 4363|       |
 4364|   318k|  int good_shear_params = av1_get_shear_params(params);
 4365|   318k|  if (!good_shear_params) return 0;
  ------------------
  |  Branch (4365:7): [True: 9.88k, False: 308k]
  ------------------
 4366|       |
 4367|   308k|  return 1;
 4368|   318k|}
decodeframe.c:read_film_grain:
 4073|   180k|                                   struct aom_read_bit_buffer *rb) {
 4074|   180k|  if (cm->seq_params->film_grain_params_present &&
  ------------------
  |  Branch (4074:7): [True: 120k, False: 59.9k]
  ------------------
 4075|   180k|      (cm->show_frame || cm->showable_frame)) {
  ------------------
  |  Branch (4075:8): [True: 61.2k, False: 59.0k]
  |  Branch (4075:26): [True: 10.7k, False: 48.2k]
  ------------------
 4076|  72.0k|    read_film_grain_params(cm, rb);
 4077|   108k|  } else {
 4078|   108k|    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
 4079|   108k|  }
 4080|   180k|  cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
 4081|   180k|  cm->cur_frame->film_grain_params = cm->film_grain_params;
 4082|   180k|}
decodeframe.c:read_film_grain_params:
 3908|  72.0k|                                   struct aom_read_bit_buffer *rb) {
 3909|  72.0k|  aom_film_grain_t *pars = &cm->film_grain_params;
 3910|  72.0k|  const SequenceHeader *const seq_params = cm->seq_params;
 3911|       |
 3912|  72.0k|  pars->apply_grain = aom_rb_read_bit(rb);
 3913|  72.0k|  if (!pars->apply_grain) {
  ------------------
  |  Branch (3913:7): [True: 47.6k, False: 24.3k]
  ------------------
 3914|  47.6k|    memset(pars, 0, sizeof(*pars));
 3915|  47.6k|    return;
 3916|  47.6k|  }
 3917|       |
 3918|  24.3k|  pars->random_seed = aom_rb_read_literal(rb, 16);
 3919|  24.3k|  if (cm->current_frame.frame_type == INTER_FRAME)
  ------------------
  |  Branch (3919:7): [True: 1.72k, False: 22.6k]
  ------------------
 3920|  1.72k|    pars->update_parameters = aom_rb_read_bit(rb);
 3921|  22.6k|  else
 3922|  22.6k|    pars->update_parameters = 1;
 3923|       |
 3924|  24.3k|  pars->bit_depth = seq_params->bit_depth;
 3925|       |
 3926|  24.3k|  if (!pars->update_parameters) {
  ------------------
  |  Branch (3926:7): [True: 1.66k, False: 22.7k]
  ------------------
 3927|       |    // inherit parameters from a previous reference frame
 3928|  1.66k|    int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3);
 3929|       |    // Section 6.8.20: It is a requirement of bitstream conformance that
 3930|       |    // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value
 3931|       |    // of j in the range 0 to REFS_PER_FRAME - 1.
 3932|  1.66k|    int found = 0;
 3933|  10.3k|    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
  ------------------
  |  Branch (3933:21): [True: 9.46k, False: 935]
  ------------------
 3934|  9.46k|      if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) {
  ------------------
  |  Branch (3934:11): [True: 726, False: 8.73k]
  ------------------
 3935|    726|        found = 1;
 3936|    726|        break;
 3937|    726|      }
 3938|  9.46k|    }
 3939|  1.66k|    if (!found) {
  ------------------
  |  Branch (3939:9): [True: 935, False: 726]
  ------------------
 3940|    935|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3941|    935|                         "Invalid film grain reference idx %d. ref_frame_idx = "
 3942|    935|                         "{%d, %d, %d, %d, %d, %d, %d}",
 3943|    935|                         film_grain_params_ref_idx, cm->remapped_ref_idx[0],
 3944|    935|                         cm->remapped_ref_idx[1], cm->remapped_ref_idx[2],
 3945|    935|                         cm->remapped_ref_idx[3], cm->remapped_ref_idx[4],
 3946|    935|                         cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]);
 3947|    935|    }
 3948|  1.66k|    RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx];
 3949|  1.66k|    if (buf == NULL) {
  ------------------
  |  Branch (3949:9): [True: 0, False: 1.66k]
  ------------------
 3950|      0|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3951|      0|                         "Invalid Film grain reference idx");
 3952|      0|    }
 3953|  1.66k|    if (!buf->film_grain_params_present) {
  ------------------
  |  Branch (3953:9): [True: 77, False: 1.58k]
  ------------------
 3954|     77|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3955|     77|                         "Film grain reference parameters not available");
 3956|     77|    }
 3957|  1.66k|    uint16_t random_seed = pars->random_seed;
 3958|  1.66k|    *pars = buf->film_grain_params;   // inherit paramaters
 3959|  1.66k|    pars->random_seed = random_seed;  // with new random seed
 3960|  1.66k|    return;
 3961|  1.66k|  }
 3962|       |
 3963|       |  // Scaling functions parameters
 3964|  22.7k|  pars->num_y_points = aom_rb_read_literal(rb, 4);  // max 14
 3965|  22.7k|  if (pars->num_y_points > 14)
  ------------------
  |  Branch (3965:7): [True: 169, False: 22.5k]
  ------------------
 3966|    169|    aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3967|    169|                       "Number of points for film grain luma scaling function "
 3968|    169|                       "exceeds the maximum value.");
 3969|  54.8k|  for (int i = 0; i < pars->num_y_points; i++) {
  ------------------
  |  Branch (3969:19): [True: 32.0k, False: 22.7k]
  ------------------
 3970|  32.0k|    pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8);
 3971|  32.0k|    if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0])
  ------------------
  |  Branch (3971:9): [True: 15.2k, False: 16.8k]
  |  Branch (3971:14): [True: 950, False: 14.2k]
  ------------------
 3972|    950|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3973|    950|                         "First coordinate of the scaling function points "
 3974|    950|                         "shall be increasing.");
 3975|  32.0k|    pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
 3976|  32.0k|  }
 3977|       |
 3978|  22.7k|  if (!seq_params->monochrome)
  ------------------
  |  Branch (3978:7): [True: 20.8k, False: 1.84k]
  ------------------
 3979|  20.8k|    pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
 3980|  1.84k|  else
 3981|  1.84k|    pars->chroma_scaling_from_luma = 0;
 3982|       |
 3983|  22.7k|  if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
  ------------------
  |  Branch (3983:7): [True: 1.84k, False: 20.8k]
  |  Branch (3983:33): [True: 10.1k, False: 10.7k]
  ------------------
 3984|  22.7k|      ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
  ------------------
  |  Branch (3984:8): [True: 4.32k, False: 6.39k]
  |  Branch (3984:44): [True: 3.94k, False: 379]
  ------------------
 3985|  12.4k|       (pars->num_y_points == 0))) {
  ------------------
  |  Branch (3985:8): [True: 2.00k, False: 1.94k]
  ------------------
 3986|  12.4k|    pars->num_cb_points = 0;
 3987|  12.4k|    pars->num_cr_points = 0;
 3988|  12.4k|  } else {
 3989|  10.3k|    pars->num_cb_points = aom_rb_read_literal(rb, 4);  // max 10
 3990|  10.3k|    if (pars->num_cb_points > 10)
  ------------------
  |  Branch (3990:9): [True: 303, False: 10.0k]
  ------------------
 3991|    303|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3992|    303|                         "Number of points for film grain cb scaling function "
 3993|    303|                         "exceeds the maximum value.");
 3994|  16.2k|    for (int i = 0; i < pars->num_cb_points; i++) {
  ------------------
  |  Branch (3994:21): [True: 5.91k, False: 10.3k]
  ------------------
 3995|  5.91k|      pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8);
 3996|  5.91k|      if (i &&
  ------------------
  |  Branch (3996:11): [True: 3.24k, False: 2.67k]
  ------------------
 3997|  5.91k|          pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0])
  ------------------
  |  Branch (3997:11): [True: 296, False: 2.94k]
  ------------------
 3998|    296|        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 3999|    296|                           "First coordinate of the scaling function points "
 4000|    296|                           "shall be increasing.");
 4001|  5.91k|      pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8);
 4002|  5.91k|    }
 4003|       |
 4004|  10.3k|    pars->num_cr_points = aom_rb_read_literal(rb, 4);  // max 10
 4005|  10.3k|    if (pars->num_cr_points > 10)
  ------------------
  |  Branch (4005:9): [True: 136, False: 10.1k]
  ------------------
 4006|    136|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 4007|    136|                         "Number of points for film grain cr scaling function "
 4008|    136|                         "exceeds the maximum value.");
 4009|  28.4k|    for (int i = 0; i < pars->num_cr_points; i++) {
  ------------------
  |  Branch (4009:21): [True: 18.1k, False: 10.3k]
  ------------------
 4010|  18.1k|      pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8);
 4011|  18.1k|      if (i &&
  ------------------
  |  Branch (4011:11): [True: 12.2k, False: 5.95k]
  ------------------
 4012|  18.1k|          pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0])
  ------------------
  |  Branch (4012:11): [True: 457, False: 11.7k]
  ------------------
 4013|    457|        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 4014|    457|                           "First coordinate of the scaling function points "
 4015|    457|                           "shall be increasing.");
 4016|  18.1k|      pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
 4017|  18.1k|    }
 4018|       |
 4019|  10.3k|    if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
  ------------------
  |  Branch (4019:9): [True: 1.84k, False: 8.45k]
  |  Branch (4019:45): [True: 1.60k, False: 239]
  ------------------
 4020|  10.3k|        (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
  ------------------
  |  Branch (4020:11): [True: 24, False: 1.58k]
  |  Branch (4020:41): [True: 3, False: 21]
  ------------------
 4021|  1.60k|         ((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
  ------------------
  |  Branch (4021:11): [True: 1.58k, False: 21]
  |  Branch (4021:41): [True: 13, False: 1.57k]
  ------------------
 4022|     16|      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
 4023|     16|                         "In YCbCr 4:2:0, film grain shall be applied "
 4024|     16|                         "to both chroma components or neither.");
 4025|  10.3k|  }
 4026|       |
 4027|  22.7k|  pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8;  // 8 + value
 4028|       |
 4029|       |  // AR coefficients
 4030|       |  // Only sent if the corresponsing scaling function has
 4031|       |  // more than 0 points
 4032|       |
 4033|  22.7k|  pars->ar_coeff_lag = aom_rb_read_literal(rb, 2);
 4034|       |
 4035|  22.7k|  int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
 4036|  22.7k|  int num_pos_chroma = num_pos_luma;
 4037|  22.7k|  if (pars->num_y_points > 0) ++num_pos_chroma;
  ------------------
  |  Branch (4037:7): [True: 14.9k, False: 7.80k]
  ------------------
 4038|       |
 4039|  22.7k|  if (pars->num_y_points)
  ------------------
  |  Branch (4039:7): [True: 14.9k, False: 7.80k]
  ------------------
 4040|  48.0k|    for (int i = 0; i < num_pos_luma; i++)
  ------------------
  |  Branch (4040:21): [True: 33.1k, False: 14.9k]
  ------------------
 4041|  33.1k|      pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128;
 4042|       |
 4043|  22.7k|  if (pars->num_cb_points || pars->chroma_scaling_from_luma)
  ------------------
  |  Branch (4043:7): [True: 5.19k, False: 17.5k]
  |  Branch (4043:30): [True: 10.1k, False: 7.38k]
  ------------------
 4044|  52.7k|    for (int i = 0; i < num_pos_chroma; i++)
  ------------------
  |  Branch (4044:21): [True: 40.3k, False: 12.3k]
  ------------------
 4045|  40.3k|      pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128;
 4046|       |
 4047|  22.7k|  if (pars->num_cr_points || pars->chroma_scaling_from_luma)
  ------------------
  |  Branch (4047:7): [True: 8.67k, False: 14.0k]
  |  Branch (4047:30): [True: 9.76k, False: 4.28k]
  ------------------
 4048|  55.9k|    for (int i = 0; i < num_pos_chroma; i++)
  ------------------
  |  Branch (4048:21): [True: 40.8k, False: 15.1k]
  ------------------
 4049|  40.8k|      pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128;
 4050|       |
 4051|  22.7k|  pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6;  // 6 + value
 4052|       |
 4053|  22.7k|  pars->grain_scale_shift = aom_rb_read_literal(rb, 2);
 4054|       |
 4055|  22.7k|  if (pars->num_cb_points) {
  ------------------
  |  Branch (4055:7): [True: 2.25k, False: 20.4k]
  ------------------
 4056|  2.25k|    pars->cb_mult = aom_rb_read_literal(rb, 8);
 4057|  2.25k|    pars->cb_luma_mult = aom_rb_read_literal(rb, 8);
 4058|  2.25k|    pars->cb_offset = aom_rb_read_literal(rb, 9);
 4059|  2.25k|  }
 4060|       |
 4061|  22.7k|  if (pars->num_cr_points) {
  ------------------
  |  Branch (4061:7): [True: 5.34k, False: 17.3k]
  ------------------
 4062|  5.34k|    pars->cr_mult = aom_rb_read_literal(rb, 8);
 4063|  5.34k|    pars->cr_luma_mult = aom_rb_read_literal(rb, 8);
 4064|  5.34k|    pars->cr_offset = aom_rb_read_literal(rb, 9);
 4065|  5.34k|  }
 4066|       |
 4067|  22.7k|  pars->overlap_flag = aom_rb_read_bit(rb);
 4068|       |
 4069|  22.7k|  pars->clip_to_restricted_range = aom_rb_read_bit(rb);
 4070|  22.7k|}
decodeframe.c:read_ext_tile_info:
 2203|  21.3k|                                      struct aom_read_bit_buffer *const rb) {
 2204|  21.3k|  AV1_COMMON *const cm = &pbi->common;
 2205|       |
 2206|       |  // This information is stored as a separate byte.
 2207|  21.3k|  int mod = rb->bit_offset % CHAR_BIT;
 2208|  21.3k|  if (mod > 0) aom_rb_read_literal(rb, CHAR_BIT - mod);
  ------------------
  |  Branch (2208:7): [True: 18.4k, False: 2.91k]
  ------------------
 2209|  21.3k|  assert(rb->bit_offset % CHAR_BIT == 0);
 2210|       |
 2211|  21.3k|  if (cm->tiles.cols * cm->tiles.rows > 1) {
  ------------------
  |  Branch (2211:7): [True: 7.28k, False: 14.0k]
  ------------------
 2212|       |    // Read the number of bytes used to store tile size
 2213|  7.28k|    pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1;
 2214|  7.28k|    pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
 2215|  7.28k|  }
 2216|  21.3k|}
decodeframe.c:setup_frame_info:
 5255|   152k|static inline void setup_frame_info(AV1Decoder *pbi) {
 5256|   152k|  AV1_COMMON *const cm = &pbi->common;
 5257|       |
 5258|   152k|  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
  ------------------
  |  Branch (5258:7): [True: 24.7k, False: 128k]
  ------------------
 5259|   152k|      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
  ------------------
  |  Branch (5259:7): [True: 3.62k, False: 124k]
  ------------------
 5260|   152k|      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
  ------------------
  |  Branch (5260:7): [True: 1.86k, False: 122k]
  ------------------
 5261|  30.2k|    av1_alloc_restoration_buffers(cm, /*is_sgr_enabled =*/true);
 5262|   111k|    for (int p = 0; p < av1_num_planes(cm); p++) {
  ------------------
  |  Branch (5262:21): [True: 81.7k, False: 30.2k]
  ------------------
 5263|  81.7k|      av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
 5264|  81.7k|    }
 5265|  30.2k|  }
 5266|       |
 5267|   152k|  const int use_highbd = cm->seq_params->use_highbitdepth;
 5268|   152k|  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
  ------------------
  |  |   84|   152k|  (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \
  |  |  ------------------
  |  |  |  |   32|   152k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   152k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \
  |  |  ------------------
  |  |  |  |   31|   152k|#define AOM_INTERP_EXTEND 4
  |  |  ------------------
  |  |   85|   152k|   ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2))
  |  |  ------------------
  |  |  |  |   32|   152k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|   152k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                  ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2))
  |  |  ------------------
  |  |  |  |   31|   152k|#define AOM_INTERP_EXTEND 4
  |  |  ------------------
  ------------------
 5269|   152k|  if (pbi->td.mc_buf_size != buf_size) {
  ------------------
  |  Branch (5269:7): [True: 17.0k, False: 135k]
  ------------------
 5270|  17.0k|    av1_free_mc_tmp_buf(&pbi->td);
 5271|  17.0k|    allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd);
 5272|  17.0k|  }
 5273|   152k|}
decodeframe.c:allocate_mc_tmp_buf:
 3402|   171k|                                       int use_highbd) {
 3403|   513k|  for (int ref = 0; ref < 2; ref++) {
  ------------------
  |  Branch (3403:21): [True: 342k, False: 171k]
  ------------------
 3404|       |    // The mc_buf/hbd_mc_buf must be zeroed to fix a intermittent valgrind error
 3405|       |    // 'Conditional jump or move depends on uninitialised value' from the loop
 3406|       |    // filter. Uninitialized reads in convolve function (e.g. horiz_4tap path in
 3407|       |    // av1_convolve_2d_sr_avx2()) from mc_buf/hbd_mc_buf are seen to be the
 3408|       |    // potential reason for this issue.
 3409|   342k|    if (use_highbd) {
  ------------------
  |  Branch (3409:9): [True: 170k, False: 171k]
  ------------------
 3410|   170k|      uint16_t *hbd_mc_buf;
 3411|   170k|      CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size));
  ------------------
  |  |   51|   170k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   170k|  do {                                                    \
  |  |  |  |   69|   170k|    lval = (expr);                                        \
  |  |  |  |   70|   170k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 170k]
  |  |  |  |  ------------------
  |  |  |  |   71|   170k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   170k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3412|   170k|      memset(hbd_mc_buf, 0, buf_size);
 3413|   170k|      thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf);
  ------------------
  |  |   76|   170k|#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  ------------------
 3414|   171k|    } else {
 3415|   171k|      CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref],
  ------------------
  |  |   51|   171k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   171k|  do {                                                    \
  |  |  |  |   69|   171k|    lval = (expr);                                        \
  |  |  |  |   70|   171k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 171k]
  |  |  |  |  ------------------
  |  |  |  |   71|   171k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   171k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3416|   171k|                      (uint8_t *)aom_memalign(16, buf_size));
 3417|   171k|      memset(thread_data->mc_buf[ref], 0, buf_size);
 3418|   171k|    }
 3419|   342k|  }
 3420|   171k|  thread_data->mc_buf_size = buf_size;
 3421|   171k|  thread_data->mc_buf_use_highbd = use_highbd;
 3422|       |
 3423|   171k|  CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
  ------------------
  |  |   51|   171k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   171k|  do {                                                    \
  |  |  |  |   69|   171k|    lval = (expr);                                        \
  |  |  |  |   70|   171k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 171k]
  |  |  |  |  ------------------
  |  |  |  |   71|   171k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   171k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3424|   171k|                  aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
 3425|   171k|                                       sizeof(*thread_data->tmp_conv_dst)));
 3426|   171k|  CHECK_MEM_ERROR(cm, thread_data->seg_mask,
  ------------------
  |  |   51|   171k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   171k|  do {                                                    \
  |  |  |  |   69|   171k|    lval = (expr);                                        \
  |  |  |  |   70|   171k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 171k]
  |  |  |  |  ------------------
  |  |  |  |   71|   171k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   171k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3427|   171k|                  (uint8_t *)aom_memalign(
 3428|   171k|                      16, 2 * MAX_SB_SQUARE * sizeof(*thread_data->seg_mask)));
 3429|       |
 3430|   513k|  for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (3430:19): [True: 342k, False: 171k]
  ------------------
 3431|   342k|    CHECK_MEM_ERROR(
  ------------------
  |  |   51|   342k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   342k|  do {                                                    \
  |  |  |  |   69|   342k|    lval = (expr);                                        \
  |  |  |  |   70|   342k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 342k]
  |  |  |  |  ------------------
  |  |  |  |   71|   342k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   342k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3432|   342k|        cm, thread_data->tmp_obmc_bufs[i],
 3433|   342k|        aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
 3434|   342k|                             sizeof(*thread_data->tmp_obmc_bufs[i])));
 3435|   342k|  }
 3436|   171k|}
decodeframe.c:decode_tiles_row_mt:
 3750|  73.1k|                                          int start_tile, int end_tile) {
 3751|  73.1k|  AV1_COMMON *const cm = &pbi->common;
 3752|  73.1k|  CommonTileParams *const tiles = &cm->tiles;
 3753|  73.1k|  const int tile_cols = tiles->cols;
 3754|  73.1k|  const int tile_rows = tiles->rows;
 3755|  73.1k|  const int n_tiles = tile_cols * tile_rows;
 3756|  73.1k|  TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
 3757|  73.1k|  const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
  ------------------
  |  |   34|  73.1k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 73.1k, False: 0]
  |  |  ------------------
  ------------------
 3758|  73.1k|  const int single_row = pbi->dec_tile_row >= 0;
 3759|  73.1k|  const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
  ------------------
  |  |   34|  73.1k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 73.1k, False: 0]
  |  |  ------------------
  ------------------
 3760|  73.1k|  const int single_col = pbi->dec_tile_col >= 0;
 3761|  73.1k|  int tile_rows_start;
 3762|  73.1k|  int tile_rows_end;
 3763|  73.1k|  int tile_cols_start;
 3764|  73.1k|  int tile_cols_end;
 3765|  73.1k|  int tile_count_tg;
 3766|  73.1k|  int num_workers = 0;
 3767|  73.1k|  int max_threads;
 3768|  73.1k|  const uint8_t *raw_data_end = NULL;
 3769|  73.1k|  int max_sb_rows = 0;
 3770|       |
 3771|  73.1k|  if (tiles->large_scale) {
  ------------------
  |  Branch (3771:7): [True: 7.91k, False: 65.2k]
  ------------------
 3772|  7.91k|    tile_rows_start = single_row ? dec_tile_row : 0;
  ------------------
  |  Branch (3772:23): [True: 0, False: 7.91k]
  ------------------
 3773|  7.91k|    tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
  ------------------
  |  Branch (3773:21): [True: 0, False: 7.91k]
  ------------------
 3774|  7.91k|    tile_cols_start = single_col ? dec_tile_col : 0;
  ------------------
  |  Branch (3774:23): [True: 0, False: 7.91k]
  ------------------
 3775|  7.91k|    tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
  ------------------
  |  Branch (3775:21): [True: 0, False: 7.91k]
  ------------------
 3776|  65.2k|  } else {
 3777|  65.2k|    tile_rows_start = 0;
 3778|  65.2k|    tile_rows_end = tile_rows;
 3779|  65.2k|    tile_cols_start = 0;
 3780|  65.2k|    tile_cols_end = tile_cols;
 3781|  65.2k|  }
 3782|  73.1k|  tile_count_tg = end_tile - start_tile + 1;
 3783|  73.1k|  max_threads = pbi->max_threads;
 3784|       |
 3785|       |  // No tiles to decode.
 3786|  73.1k|  if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
  ------------------
  |  Branch (3786:7): [True: 0, False: 73.1k]
  |  Branch (3786:43): [True: 0, False: 73.1k]
  ------------------
 3787|       |      // First tile is larger than end_tile.
 3788|  73.1k|      tile_rows_start * tile_cols + tile_cols_start > end_tile ||
  ------------------
  |  Branch (3788:7): [True: 0, False: 73.1k]
  ------------------
 3789|       |      // Last tile is smaller than start_tile.
 3790|  73.1k|      (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
  ------------------
  |  Branch (3790:7): [True: 0, False: 73.1k]
  ------------------
 3791|      0|    return data;
 3792|       |
 3793|  73.1k|  assert(tile_rows <= MAX_TILE_ROWS);
 3794|  73.1k|  assert(tile_cols <= MAX_TILE_COLS);
 3795|  73.1k|  assert(tile_count_tg > 0);
 3796|  73.1k|  assert(max_threads > 0);
 3797|  73.1k|  assert(start_tile <= end_tile);
 3798|  73.1k|  assert(start_tile >= 0 && end_tile < n_tiles);
 3799|       |
 3800|  73.1k|  (void)tile_count_tg;
 3801|       |
 3802|  73.1k|  decode_mt_init(pbi);
 3803|       |
 3804|       |  // get tile size in tile group
 3805|  73.1k|#if EXT_TILE_DEBUG
 3806|  73.1k|  if (tiles->large_scale) assert(pbi->ext_tile_debug == 1);
  ------------------
  |  Branch (3806:7): [True: 7.91k, False: 65.2k]
  ------------------
 3807|  73.1k|  if (tiles->large_scale)
  ------------------
  |  Branch (3807:7): [True: 7.91k, False: 65.2k]
  ------------------
 3808|  7.91k|    raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
 3809|  65.2k|  else
 3810|  65.2k|#endif  // EXT_TILE_DEBUG
 3811|  65.2k|    get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 3812|       |
 3813|  73.1k|  if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
  ------------------
  |  Branch (3813:7): [True: 8.38k, False: 64.7k]
  |  Branch (3813:33): [True: 798, False: 63.9k]
  ------------------
 3814|  4.12k|    if (pbi->tile_data != NULL) {
  ------------------
  |  Branch (3814:9): [True: 798, False: 3.32k]
  ------------------
 3815|  3.95k|      for (int i = 0; i < pbi->allocated_tiles; i++) {
  ------------------
  |  Branch (3815:23): [True: 3.15k, False: 798]
  ------------------
 3816|  3.15k|        TileDataDec *const tile_data = pbi->tile_data + i;
 3817|  3.15k|        av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
 3818|  3.15k|      }
 3819|    798|    }
 3820|  4.12k|    decoder_alloc_tile_data(pbi, n_tiles);
 3821|  4.12k|  }
 3822|  73.1k|  if (pbi->dcb.xd.seg_mask == NULL)
  ------------------
  |  Branch (3822:7): [True: 3.32k, False: 69.7k]
  ------------------
 3823|  73.1k|    CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
  ------------------
  |  |   51|  3.32k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.32k|  do {                                                    \
  |  |  |  |   69|  3.32k|    lval = (expr);                                        \
  |  |  |  |   70|  3.32k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.32k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.32k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.32k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3824|  73.1k|                    (uint8_t *)aom_memalign(
 3825|  73.1k|                        16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
 3826|       |
 3827|   158k|  for (int row = 0; row < tile_rows; row++) {
  ------------------
  |  Branch (3827:21): [True: 85.2k, False: 73.1k]
  ------------------
 3828|   176k|    for (int col = 0; col < tile_cols; col++) {
  ------------------
  |  Branch (3828:23): [True: 91.6k, False: 85.2k]
  ------------------
 3829|  91.6k|      TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col;
 3830|  91.6k|      av1_tile_init(&tile_data->tile_info, cm, row, col);
 3831|       |
 3832|  91.6k|      max_sb_rows = AOMMAX(max_sb_rows,
  ------------------
  |  |   35|  91.6k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.04k, False: 90.6k]
  |  |  ------------------
  ------------------
 3833|  91.6k|                           av1_get_sb_rows_in_tile(cm, &tile_data->tile_info));
 3834|  91.6k|      num_workers += get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info);
 3835|  91.6k|    }
 3836|  85.2k|  }
 3837|  73.1k|  num_workers = AOMMIN(num_workers, max_threads);
  ------------------
  |  |   34|  73.1k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 67.4k, False: 5.69k]
  |  |  ------------------
  ------------------
 3838|       |
 3839|  73.1k|  if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {
  ------------------
  |  Branch (3839:7): [True: 7.72k, False: 65.4k]
  ------------------
 3840|  35.2k|    for (int i = 0; i < n_tiles; ++i) {
  ------------------
  |  Branch (3840:21): [True: 27.5k, False: 7.72k]
  ------------------
 3841|  27.5k|      TileDataDec *const tile_data = pbi->tile_data + i;
 3842|  27.5k|      av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
 3843|  27.5k|      dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows);
 3844|  27.5k|    }
 3845|  7.72k|    pbi->allocated_row_mt_sync_rows = max_sb_rows;
 3846|  7.72k|  }
 3847|       |
 3848|  73.1k|  tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
 3849|  73.1k|                tile_cols_start, tile_cols_end, start_tile, end_tile);
 3850|       |
 3851|  73.1k|  dec_alloc_cb_buf(pbi);
 3852|       |
 3853|  73.1k|  row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start,
 3854|  73.1k|                    tile_cols_end, start_tile, end_tile, max_sb_rows);
 3855|       |
 3856|  73.1k|  reset_dec_workers(pbi, row_mt_worker_hook, num_workers);
 3857|  73.1k|  launch_dec_workers(pbi, data_end, num_workers);
 3858|  73.1k|  sync_dec_workers(pbi, num_workers);
 3859|       |
 3860|  73.1k|  if (pbi->dcb.corrupted)
  ------------------
  |  Branch (3860:7): [True: 7.70k, False: 65.4k]
  ------------------
 3861|  7.70k|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 3862|  7.70k|                       "Failed to decode tile data");
 3863|       |
 3864|  73.1k|  if (tiles->large_scale) {
  ------------------
  |  Branch (3864:7): [True: 3.25k, False: 69.8k]
  ------------------
 3865|  3.25k|    if (n_tiles == 1) {
  ------------------
  |  Branch (3865:9): [True: 3.24k, False: 8]
  ------------------
 3866|       |      // Find the end of the single tile buffer
 3867|  3.24k|      return aom_reader_find_end(&pbi->tile_data->bit_reader);
 3868|  3.24k|    }
 3869|       |    // Return the end of the last tile buffer
 3870|      8|    return raw_data_end;
 3871|  3.25k|  }
 3872|  69.8k|  TileDataDec *const tile_data = pbi->tile_data + end_tile;
 3873|       |
 3874|  69.8k|  return aom_reader_find_end(&tile_data->bit_reader);
 3875|  73.1k|}
decodeframe.c:decode_mt_init:
 3501|  73.1k|static inline void decode_mt_init(AV1Decoder *pbi) {
 3502|  73.1k|  AV1_COMMON *const cm = &pbi->common;
 3503|  73.1k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 3504|  73.1k|  int worker_idx;
 3505|       |
 3506|       |  // Create workers and thread_data
 3507|  73.1k|  if (pbi->num_workers == 0) {
  ------------------
  |  Branch (3507:7): [True: 3.52k, False: 69.6k]
  ------------------
 3508|  3.52k|    const int num_threads = pbi->max_threads;
 3509|  3.52k|    CHECK_MEM_ERROR(cm, pbi->tile_workers,
  ------------------
  |  |   51|  3.52k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.52k|  do {                                                    \
  |  |  |  |   69|  3.52k|    lval = (expr);                                        \
  |  |  |  |   70|  3.52k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.52k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.52k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.52k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3510|  3.52k|                    aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
 3511|  3.52k|    CHECK_MEM_ERROR(cm, pbi->thread_data,
  ------------------
  |  |   51|  3.52k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.52k|  do {                                                    \
  |  |  |  |   69|  3.52k|    lval = (expr);                                        \
  |  |  |  |   70|  3.52k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.52k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.52k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.52k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3512|  3.52k|                    aom_calloc(num_threads, sizeof(*pbi->thread_data)));
 3513|       |
 3514|   133k|    for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
  ------------------
  |  Branch (3514:26): [True: 129k, False: 3.52k]
  ------------------
 3515|   129k|      AVxWorker *const worker = &pbi->tile_workers[worker_idx];
 3516|   129k|      DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
 3517|       |
 3518|   129k|      winterface->init(worker);
 3519|   129k|      worker->thread_name = "aom tile worker";
 3520|   129k|      if (worker_idx != 0 && !winterface->reset(worker)) {
  ------------------
  |  Branch (3520:11): [True: 125k, False: 3.52k]
  |  Branch (3520:30): [True: 0, False: 125k]
  ------------------
 3521|      0|        aom_internal_error(&pbi->error, AOM_CODEC_ERROR,
 3522|      0|                           "Tile decoder thread creation failed");
 3523|      0|      }
 3524|   129k|      ++pbi->num_workers;
 3525|       |
 3526|   129k|      if (worker_idx != 0) {
  ------------------
  |  Branch (3526:11): [True: 125k, False: 3.52k]
  ------------------
 3527|       |        // Allocate thread data.
 3528|   125k|        CHECK_MEM_ERROR(cm, thread_data->td,
  ------------------
  |  |   51|   125k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|   125k|  do {                                                    \
  |  |  |  |   69|   125k|    lval = (expr);                                        \
  |  |  |  |   70|   125k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 125k]
  |  |  |  |  ------------------
  |  |  |  |   71|   125k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|   125k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3529|   125k|                        aom_memalign(32, sizeof(*thread_data->td)));
 3530|   125k|        av1_zero(*thread_data->td);
  ------------------
  |  |   43|   125k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 3531|   125k|      } else {
 3532|       |        // Main thread acts as a worker and uses the thread data in pbi
 3533|  3.52k|        thread_data->td = &pbi->td;
 3534|  3.52k|      }
 3535|   129k|      thread_data->error_info.error_code = AOM_CODEC_OK;
 3536|   129k|      thread_data->error_info.setjmp = 0;
 3537|   129k|    }
 3538|  3.52k|  }
 3539|  73.1k|  const int use_highbd = cm->seq_params->use_highbitdepth;
 3540|  73.1k|  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
  ------------------
  |  |   84|  73.1k|  (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \
  |  |  ------------------
  |  |  |  |   32|  73.1k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  73.1k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \
  |  |  ------------------
  |  |  |  |   31|  73.1k|#define AOM_INTERP_EXTEND 4
  |  |  ------------------
  |  |   85|  73.1k|   ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2))
  |  |  ------------------
  |  |  |  |   32|  73.1k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  73.1k|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                  ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2))
  |  |  ------------------
  |  |  |  |   31|  73.1k|#define AOM_INTERP_EXTEND 4
  |  |  ------------------
  ------------------
 3541|  2.95M|  for (worker_idx = 1; worker_idx < pbi->max_threads; ++worker_idx) {
  ------------------
  |  Branch (3541:24): [True: 2.88M, False: 73.1k]
  ------------------
 3542|  2.88M|    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
 3543|  2.88M|    if (thread_data->td->mc_buf_size != buf_size) {
  ------------------
  |  Branch (3543:9): [True: 153k, False: 2.73M]
  ------------------
 3544|   153k|      av1_free_mc_tmp_buf(thread_data->td);
 3545|   153k|      allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
 3546|   153k|    }
 3547|  2.88M|  }
 3548|  73.1k|}
decodeframe.c:get_ls_tile_buffers:
 2288|  20.8k|    TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
 2289|  20.8k|  AV1_COMMON *const cm = &pbi->common;
 2290|  20.8k|  const int tile_cols = cm->tiles.cols;
 2291|  20.8k|  const int tile_rows = cm->tiles.rows;
 2292|  20.8k|  const int have_tiles = tile_cols * tile_rows > 1;
 2293|  20.8k|  const uint8_t *raw_data_end;  // The end of the last tile buffer
 2294|       |
 2295|  20.8k|  if (!have_tiles) {
  ------------------
  |  Branch (2295:7): [True: 14.0k, False: 6.79k]
  ------------------
 2296|  14.0k|    const size_t tile_size = data_end - data;
 2297|  14.0k|    tile_buffers[0][0].data = data;
 2298|  14.0k|    tile_buffers[0][0].size = tile_size;
 2299|  14.0k|    raw_data_end = NULL;
 2300|  14.0k|  } else {
 2301|       |    // We locate only the tile buffers that are required, which are the ones
 2302|       |    // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always
 2303|       |    // need the last (bottom right) tile buffer, as we need to know where the
 2304|       |    // end of the compressed frame buffer is for proper superframe decoding.
 2305|       |
 2306|  6.79k|    const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL };
 2307|  6.79k|    const uint8_t *const data_start = data;
 2308|       |
 2309|  6.79k|    const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
  ------------------
  |  |   34|  6.79k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.79k, False: 0]
  |  |  ------------------
  ------------------
 2310|  6.79k|    const int single_row = pbi->dec_tile_row >= 0;
 2311|  6.79k|    const int tile_rows_start = single_row ? dec_tile_row : 0;
  ------------------
  |  Branch (2311:33): [True: 0, False: 6.79k]
  ------------------
 2312|  6.79k|    const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows;
  ------------------
  |  Branch (2312:31): [True: 0, False: 6.79k]
  ------------------
 2313|  6.79k|    const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
  ------------------
  |  |   34|  6.79k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.79k, False: 0]
  |  |  ------------------
  ------------------
 2314|  6.79k|    const int single_col = pbi->dec_tile_col >= 0;
 2315|  6.79k|    const int tile_cols_start = single_col ? dec_tile_col : 0;
  ------------------
  |  Branch (2315:33): [True: 0, False: 6.79k]
  ------------------
 2316|  6.79k|    const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
  ------------------
  |  Branch (2316:31): [True: 0, False: 6.79k]
  ------------------
 2317|       |
 2318|  6.79k|    const int tile_col_size_bytes = pbi->tile_col_size_bytes;
 2319|  6.79k|    const int tile_size_bytes = pbi->tile_size_bytes;
 2320|  6.79k|    int tile_width, tile_height;
 2321|  6.79k|    if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) {
  ------------------
  |  Branch (2321:9): [True: 1.11k, False: 5.68k]
  ------------------
 2322|  1.11k|      aom_internal_error(
 2323|  1.11k|          &pbi->error, AOM_CODEC_CORRUPT_FRAME,
 2324|  1.11k|          "Not all the tiles in the tile list have the same size.");
 2325|  1.11k|    }
 2326|  6.79k|    const int tile_copy_mode =
 2327|  6.79k|        ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 1 : 0;
  ------------------
  |  |   35|  6.79k|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 1.52k, False: 5.27k]
  |  |  ------------------
  ------------------
                      ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 1 : 0;
  ------------------
  |  |   39|  6.79k|#define MI_SIZE_LOG2 2
  ------------------
  |  Branch (2327:9): [True: 1.64k, False: 5.15k]
  ------------------
 2328|       |    // Read tile column sizes for all columns (we need the last tile buffer)
 2329|  13.4k|    for (int c = 0; c < tile_cols; ++c) {
  ------------------
  |  Branch (2329:21): [True: 6.63k, False: 6.79k]
  ------------------
 2330|  6.63k|      const int is_last = c == tile_cols - 1;
 2331|  6.63k|      size_t tile_col_size;
 2332|       |
 2333|  6.63k|      if (!is_last) {
  ------------------
  |  Branch (2333:11): [True: 4.59k, False: 2.04k]
  ------------------
 2334|  4.59k|        if (tile_col_size_bytes > data_end - data) {
  ------------------
  |  Branch (2334:13): [True: 67, False: 4.53k]
  ------------------
 2335|     67|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 2336|     67|                             "Not enough data to read tile_col_size");
 2337|     67|        }
 2338|  4.59k|        tile_col_size = mem_get_varsize(data, tile_col_size_bytes);
 2339|  4.59k|        data += tile_col_size_bytes;
 2340|  4.59k|        if (tile_col_size > (size_t)(data_end - data)) {
  ------------------
  |  Branch (2340:13): [True: 3.57k, False: 1.01k]
  ------------------
 2341|  3.57k|          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 2342|  3.57k|                             "tile_col_data_end[%d] is out of bound", c);
 2343|  3.57k|        }
 2344|  4.59k|        tile_col_data_end[c] = data + tile_col_size;
 2345|  4.59k|      } else {
 2346|  2.04k|        tile_col_size = data_end - data;
 2347|  2.04k|        tile_col_data_end[c] = data_end;
 2348|  2.04k|      }
 2349|  6.63k|      data += tile_col_size;
 2350|  6.63k|    }
 2351|       |
 2352|  6.79k|    data = data_start;
 2353|       |
 2354|       |    // Read the required tile sizes.
 2355|  8.85k|    for (int c = tile_cols_start; c < tile_cols_end; ++c) {
  ------------------
  |  Branch (2355:35): [True: 2.05k, False: 6.79k]
  ------------------
 2356|  2.05k|      const int is_last = c == tile_cols - 1;
 2357|       |
 2358|  2.05k|      if (c > 0) data = tile_col_data_end[c - 1];
  ------------------
  |  Branch (2358:11): [True: 14, False: 2.04k]
  ------------------
 2359|       |
 2360|  2.05k|      if (!is_last) data += tile_col_size_bytes;
  ------------------
  |  Branch (2360:11): [True: 538, False: 1.51k]
  ------------------
 2361|       |
 2362|       |      // Get the whole of the last column, otherwise stop at the required tile.
 2363|  21.1k|      for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
  ------------------
  |  Branch (2363:23): [True: 19.0k, False: 2.05k]
  |  Branch (2363:28): [True: 19.2k, False: 1.93k]
  ------------------
 2364|  19.0k|        get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data,
 2365|  19.0k|                           tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
 2366|  19.0k|      }
 2367|  2.05k|    }
 2368|       |
 2369|       |    // If we have not read the last column, then read it to get the last tile.
 2370|  6.79k|    if (tile_cols_end != tile_cols) {
  ------------------
  |  Branch (2370:9): [True: 0, False: 6.79k]
  ------------------
 2371|      0|      const int c = tile_cols - 1;
 2372|       |
 2373|      0|      data = tile_col_data_end[c - 1];
 2374|       |
 2375|      0|      for (int r = 0; r < tile_rows; ++r) {
  ------------------
  |  Branch (2375:23): [True: 0, False: 0]
  ------------------
 2376|      0|        get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data,
 2377|      0|                           tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
 2378|      0|      }
 2379|      0|    }
 2380|  6.79k|    raw_data_end = data;
 2381|  6.79k|  }
 2382|  20.8k|  return raw_data_end;
 2383|  20.8k|}
decodeframe.c:mem_get_varsize:
 2219|  37.3k|static size_t mem_get_varsize(const uint8_t *src, int sz) {
 2220|  37.3k|  switch (sz) {
 2221|  27.9k|    case 1: return src[0];
  ------------------
  |  Branch (2221:5): [True: 27.9k, False: 9.45k]
  ------------------
 2222|  7.03k|    case 2: return mem_get_le16(src);
  ------------------
  |  |  101|  7.03k|#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16)
  |  |  ------------------
  |  |  |  |   51|  7.03k|#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
  |  |  |  |  ------------------
  |  |  |  |  |  |   53|  7.03k|#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   55|  7.03k|#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (2222:5): [True: 7.03k, False: 30.3k]
  ------------------
 2223|  1.37k|    case 3: return mem_get_le24(src);
  ------------------
  |  |  112|  1.37k|#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24)
  |  |  ------------------
  |  |  |  |   51|  1.37k|#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
  |  |  |  |  ------------------
  |  |  |  |  |  |   53|  1.37k|#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   55|  1.37k|#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (2223:5): [True: 1.37k, False: 35.9k]
  ------------------
 2224|  1.04k|    case 4: return mem_get_le32(src);
  ------------------
  |  |  124|  1.04k|#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32)
  |  |  ------------------
  |  |  |  |   51|  1.04k|#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
  |  |  |  |  ------------------
  |  |  |  |  |  |   53|  1.04k|#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   55|  1.04k|#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  |  Branch (2224:5): [True: 1.04k, False: 36.3k]
  ------------------
 2225|      0|    default: assert(0 && "Invalid size"); return -1;
  ------------------
  |  Branch (2225:5): [True: 0, False: 37.3k]
  ------------------
 2226|  37.3k|  }
 2227|  37.3k|}
decodeframe.c:get_ls_tile_buffer:
 2236|  19.0k|    int tile_size_bytes, int col, int row, int tile_copy_mode) {
 2237|  19.0k|  size_t size;
 2238|       |
 2239|  19.0k|  size_t copy_size = 0;
 2240|  19.0k|  const uint8_t *copy_data = NULL;
 2241|       |
 2242|  19.0k|  if (!read_is_valid(*data, tile_size_bytes, data_end))
  ------------------
  |  Branch (2242:7): [True: 526, False: 18.5k]
  ------------------
 2243|    526|    aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
 2244|    526|                       "Truncated packet or corrupt tile length");
 2245|  19.0k|  size = mem_get_varsize(*data, tile_size_bytes);
 2246|       |
 2247|       |  // If tile_copy_mode = 1, then the top bit of the tile header indicates copy
 2248|       |  // mode.
 2249|  19.0k|  if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) {
  ------------------
  |  Branch (2249:7): [True: 17.9k, False: 1.16k]
  |  Branch (2249:25): [True: 12.9k, False: 4.96k]
  ------------------
 2250|       |    // The remaining bits in the top byte signal the row offset
 2251|  12.9k|    int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
 2252|  12.9k|    if (offset > row) {
  ------------------
  |  Branch (2252:9): [True: 452, False: 12.5k]
  ------------------
 2253|    452|      aom_internal_error(
 2254|    452|          error_info, AOM_CODEC_CORRUPT_FRAME,
 2255|    452|          "Invalid row offset in tile copy mode: row=%d offset=%d", row,
 2256|    452|          offset);
 2257|    452|    }
 2258|       |
 2259|       |    // Currently, only use tiles in same column as reference tiles.
 2260|  12.9k|    copy_data = tile_buffers[row - offset][col].data;
 2261|  12.9k|    copy_size = tile_buffers[row - offset][col].size;
 2262|  12.9k|    size = 0;
 2263|  12.9k|  } else {
 2264|  6.12k|    size += AV1_MIN_TILE_SIZE_BYTES;
  ------------------
  |  |   55|  6.12k|#define AV1_MIN_TILE_SIZE_BYTES 1
  ------------------
 2265|  6.12k|  }
 2266|       |
 2267|  19.0k|  *data += tile_size_bytes;
 2268|       |
 2269|  19.0k|  if (size > (size_t)(data_end - *data))
  ------------------
  |  Branch (2269:7): [True: 392, False: 18.7k]
  ------------------
 2270|    392|    aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
 2271|    392|                       "Truncated packet or corrupt tile size");
 2272|       |
 2273|  19.0k|  if (size > 0) {
  ------------------
  |  Branch (2273:7): [True: 5.20k, False: 13.8k]
  ------------------
 2274|  5.20k|    tile_buffers[row][col].data = *data;
 2275|  5.20k|    tile_buffers[row][col].size = size;
 2276|  13.8k|  } else {
 2277|  13.8k|    tile_buffers[row][col].data = copy_data;
 2278|  13.8k|    tile_buffers[row][col].size = copy_size;
 2279|  13.8k|  }
 2280|       |
 2281|  19.0k|  *data += size;
 2282|  19.0k|}
decodeframe.c:read_is_valid:
  135|   196k|static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
  136|   196k|  return len != 0 && len <= (size_t)(end - start);
  ------------------
  |  Branch (136:10): [True: 196k, False: 654]
  |  Branch (136:22): [True: 195k, False: 917]
  ------------------
  137|   196k|}
decodeframe.c:get_tile_buffers:
 2429|   131k|    int end_tile) {
 2430|   131k|  AV1_COMMON *const cm = &pbi->common;
 2431|   131k|  const int tile_cols = cm->tiles.cols;
 2432|   131k|  const int tile_rows = cm->tiles.rows;
 2433|   131k|  int tc = 0;
 2434|       |
 2435|   266k|  for (int r = 0; r < tile_rows; ++r) {
  ------------------
  |  Branch (2435:19): [True: 134k, False: 131k]
  ------------------
 2436|   277k|    for (int c = 0; c < tile_cols; ++c, ++tc) {
  ------------------
  |  Branch (2436:21): [True: 143k, False: 134k]
  ------------------
 2437|   143k|      TileBufferDec *const buf = &tile_buffers[r][c];
 2438|       |
 2439|   143k|      const int is_last = (tc == end_tile);
 2440|   143k|      const size_t hdr_offset = 0;
 2441|       |
 2442|   143k|      if (tc < start_tile || tc > end_tile) continue;
  ------------------
  |  Branch (2442:11): [True: 0, False: 143k]
  |  Branch (2442:30): [True: 216, False: 143k]
  ------------------
 2443|       |
 2444|   143k|      if (data + hdr_offset >= data_end)
  ------------------
  |  Branch (2444:11): [True: 225, False: 143k]
  ------------------
 2445|    225|        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 2446|    225|                           "Data ended before all tiles were read.");
 2447|   143k|      data += hdr_offset;
 2448|   143k|      get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &pbi->error,
 2449|   143k|                      &data, buf);
 2450|   143k|    }
 2451|   134k|  }
 2452|   131k|}
decodeframe.c:get_tile_buffer:
 2402|   143k|                                   TileBufferDec *const buf) {
 2403|   143k|  size_t size;
 2404|       |
 2405|   143k|  if (!is_last) {
  ------------------
  |  Branch (2405:7): [True: 14.6k, False: 128k]
  ------------------
 2406|  14.6k|    if (!read_is_valid(*data, tile_size_bytes, data_end))
  ------------------
  |  Branch (2406:9): [True: 387, False: 14.2k]
  ------------------
 2407|    387|      aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
 2408|    387|                         "Not enough data to read tile size");
 2409|       |
 2410|  14.6k|    size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES;
  ------------------
  |  |   55|  14.6k|#define AV1_MIN_TILE_SIZE_BYTES 1
  ------------------
 2411|  14.6k|    *data += tile_size_bytes;
 2412|       |
 2413|  14.6k|    if (size > (size_t)(data_end - *data))
  ------------------
  |  Branch (2413:9): [True: 2.65k, False: 11.9k]
  ------------------
 2414|  2.65k|      aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
 2415|  2.65k|                         "Truncated packet or corrupt tile size");
 2416|   128k|  } else {
 2417|   128k|    size = data_end - *data;
 2418|   128k|  }
 2419|       |
 2420|   143k|  buf->data = *data;
 2421|   143k|  buf->size = size;
 2422|       |
 2423|   143k|  *data += size;
 2424|   143k|}
decodeframe.c:decoder_alloc_tile_data:
 2476|  14.5k|static inline void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) {
 2477|  14.5k|  AV1_COMMON *const cm = &pbi->common;
 2478|  14.5k|  aom_free(pbi->tile_data);
 2479|  14.5k|  pbi->allocated_tiles = 0;
 2480|  14.5k|  CHECK_MEM_ERROR(cm, pbi->tile_data,
  ------------------
  |  |   51|  14.5k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  14.5k|  do {                                                    \
  |  |  |  |   69|  14.5k|    lval = (expr);                                        \
  |  |  |  |   70|  14.5k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 14.5k]
  |  |  |  |  ------------------
  |  |  |  |   71|  14.5k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  14.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2481|  14.5k|                  aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));
 2482|  14.5k|  pbi->allocated_tiles = n_tiles;
 2483|  50.4k|  for (int i = 0; i < n_tiles; i++) {
  ------------------
  |  Branch (2483:19): [True: 35.8k, False: 14.5k]
  ------------------
 2484|  35.8k|    TileDataDec *const tile_data = pbi->tile_data + i;
 2485|  35.8k|    av1_zero(tile_data->dec_row_mt_sync);
  ------------------
  |  |   43|  35.8k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 2486|  35.8k|  }
 2487|  14.5k|  pbi->allocated_row_mt_sync_rows = 0;
 2488|  14.5k|}
decodeframe.c:get_max_row_mt_workers_per_tile:
 3013|   218k|                                                  const TileInfo *tile) {
 3014|       |  // NOTE: Currently value of max workers is calculated based
 3015|       |  // on the parse and decode time. As per the theoretical estimate
 3016|       |  // when percentage of parse time is equal to percentage of decode
 3017|       |  // time, number of workers needed to parse + decode a tile can not
 3018|       |  // exceed more than 2.
 3019|       |  // TODO(any): Modify this value if parsing is optimized in future.
 3020|   218k|  int sb_rows = av1_get_sb_rows_in_tile(cm, tile);
 3021|   218k|  int max_workers =
 3022|   218k|      sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE;
  ------------------
  |  |   78|  83.6k|#define AOM_MIN_THREADS_PER_TILE 1
  ------------------
                    sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE;
  ------------------
  |  |   79|   134k|#define AOM_MAX_THREADS_PER_TILE 2
  ------------------
  |  Branch (3022:7): [True: 83.6k, False: 134k]
  ------------------
 3023|   218k|  return max_workers;
 3024|   218k|}
decodeframe.c:dec_row_mt_alloc:
 2510|  27.5k|                                    AV1_COMMON *cm, int rows) {
 2511|  27.5k|  dec_row_mt_sync->allocated_sb_rows = rows;
 2512|  27.5k|#if CONFIG_MULTITHREAD
 2513|  27.5k|  {
 2514|  27.5k|    int i;
 2515|       |
 2516|  27.5k|    CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_,
  ------------------
  |  |   51|  27.5k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  27.5k|  do {                                                    \
  |  |  |  |   69|  27.5k|    lval = (expr);                                        \
  |  |  |  |   70|  27.5k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 27.5k]
  |  |  |  |  ------------------
  |  |  |  |   71|  27.5k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2517|  27.5k|                    aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows));
 2518|  27.5k|    if (dec_row_mt_sync->mutex_) {
  ------------------
  |  Branch (2518:9): [True: 27.5k, False: 0]
  ------------------
 2519|  80.1k|      for (i = 0; i < rows; ++i) {
  ------------------
  |  Branch (2519:19): [True: 52.6k, False: 27.5k]
  ------------------
 2520|  52.6k|        pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL);
 2521|  52.6k|      }
 2522|  27.5k|    }
 2523|       |
 2524|  27.5k|    CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_,
  ------------------
  |  |   51|  27.5k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  27.5k|  do {                                                    \
  |  |  |  |   69|  27.5k|    lval = (expr);                                        \
  |  |  |  |   70|  27.5k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 27.5k]
  |  |  |  |  ------------------
  |  |  |  |   71|  27.5k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2525|  27.5k|                    aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows));
 2526|  27.5k|    if (dec_row_mt_sync->cond_) {
  ------------------
  |  Branch (2526:9): [True: 27.5k, False: 0]
  ------------------
 2527|  80.1k|      for (i = 0; i < rows; ++i) {
  ------------------
  |  Branch (2527:19): [True: 52.6k, False: 27.5k]
  ------------------
 2528|  52.6k|        pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL);
 2529|  52.6k|      }
 2530|  27.5k|    }
 2531|  27.5k|  }
 2532|  27.5k|#endif  // CONFIG_MULTITHREAD
 2533|       |
 2534|  27.5k|  CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col,
  ------------------
  |  |   51|  27.5k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  27.5k|  do {                                                    \
  |  |  |  |   69|  27.5k|    lval = (expr);                                        \
  |  |  |  |   70|  27.5k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 27.5k]
  |  |  |  |  ------------------
  |  |  |  |   71|  27.5k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  27.5k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2535|  27.5k|                  aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows));
 2536|       |
 2537|       |  // Set up nsync.
 2538|  27.5k|  dec_row_mt_sync->sync_range = get_sync_range(cm->width);
 2539|  27.5k|}
decodeframe.c:get_sync_range:
 2491|  27.5k|static inline int get_sync_range(int width) {
 2492|       |// nsync numbers are picked by testing.
 2493|       |#if 0
 2494|       |  if (width < 640)
 2495|       |    return 1;
 2496|       |  else if (width <= 1280)
 2497|       |    return 2;
 2498|       |  else if (width <= 4096)
 2499|       |    return 4;
 2500|       |  else
 2501|       |    return 8;
 2502|       |#else
 2503|  27.5k|  (void)width;
 2504|  27.5k|#endif
 2505|  27.5k|  return 1;
 2506|  27.5k|}
decodeframe.c:tile_mt_queue:
 3553|  68.0k|                                 int start_tile, int end_tile) {
 3554|  68.0k|  AV1_COMMON *const cm = &pbi->common;
 3555|  68.0k|  if (pbi->tile_mt_info.alloc_tile_cols != tile_cols ||
  ------------------
  |  Branch (3555:7): [True: 3.56k, False: 64.5k]
  ------------------
 3556|  68.0k|      pbi->tile_mt_info.alloc_tile_rows != tile_rows) {
  ------------------
  |  Branch (3556:7): [True: 564, False: 63.9k]
  ------------------
 3557|  4.12k|    av1_dealloc_dec_jobs(&pbi->tile_mt_info);
 3558|  4.12k|    alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols);
 3559|  4.12k|  }
 3560|  68.0k|  enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
 3561|  68.0k|                    tile_cols_end, start_tile, end_tile);
 3562|  68.0k|  qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued,
 3563|  68.0k|        sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers);
 3564|  68.0k|}
decodeframe.c:alloc_dec_jobs:
 3360|  4.12k|                                  int tile_rows, int tile_cols) {
 3361|  4.12k|  tile_mt_info->alloc_tile_rows = tile_rows;
 3362|  4.12k|  tile_mt_info->alloc_tile_cols = tile_cols;
 3363|  4.12k|  int num_tiles = tile_rows * tile_cols;
 3364|  4.12k|#if CONFIG_MULTITHREAD
 3365|  4.12k|  {
 3366|  4.12k|    CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex,
  ------------------
  |  |   51|  4.12k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  4.12k|  do {                                                    \
  |  |  |  |   69|  4.12k|    lval = (expr);                                        \
  |  |  |  |   70|  4.12k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 4.12k]
  |  |  |  |  ------------------
  |  |  |  |   71|  4.12k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  4.12k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3367|  4.12k|                    aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles));
 3368|       |
 3369|  28.0k|    for (int i = 0; i < num_tiles; i++) {
  ------------------
  |  Branch (3369:21): [True: 23.9k, False: 4.12k]
  ------------------
 3370|  23.9k|      pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL);
 3371|  23.9k|    }
 3372|  4.12k|  }
 3373|  4.12k|#endif
 3374|  4.12k|  CHECK_MEM_ERROR(cm, tile_mt_info->job_queue,
  ------------------
  |  |   51|  4.12k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  4.12k|  do {                                                    \
  |  |  |  |   69|  4.12k|    lval = (expr);                                        \
  |  |  |  |   70|  4.12k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 4.12k]
  |  |  |  |  ------------------
  |  |  |  |   71|  4.12k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  4.12k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3375|  4.12k|                  aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
 3376|  4.12k|}
decodeframe.c:enqueue_tile_jobs:
 3340|  68.0k|                                     int start_tile, int end_tile) {
 3341|  68.0k|  AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info;
 3342|  68.0k|  TileJobsDec *tile_job_queue = tile_mt_info->job_queue;
 3343|  68.0k|  tile_mt_info->jobs_enqueued = 0;
 3344|  68.0k|  tile_mt_info->jobs_dequeued = 0;
 3345|       |
 3346|   153k|  for (int row = tile_rows_start; row < tile_rows_end; row++) {
  ------------------
  |  Branch (3346:35): [True: 85.2k, False: 68.0k]
  ------------------
 3347|   176k|    for (int col = tile_cols_start; col < tile_cols_end; col++) {
  ------------------
  |  Branch (3347:37): [True: 91.6k, False: 85.2k]
  ------------------
 3348|  91.6k|      if (row * cm->tiles.cols + col < start_tile ||
  ------------------
  |  Branch (3348:11): [True: 0, False: 91.6k]
  ------------------
 3349|  91.6k|          row * cm->tiles.cols + col > end_tile)
  ------------------
  |  Branch (3349:11): [True: 169, False: 91.5k]
  ------------------
 3350|    169|        continue;
 3351|  91.5k|      tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col];
 3352|  91.5k|      tile_job_queue->tile_data = pbi->tile_data + row * cm->tiles.cols + col;
 3353|  91.5k|      tile_job_queue++;
 3354|  91.5k|      tile_mt_info->jobs_enqueued++;
 3355|  91.5k|    }
 3356|  85.2k|  }
 3357|  68.0k|}
decodeframe.c:compare_tile_buffers:
 3331|  77.3k|static int compare_tile_buffers(const void *a, const void *b) {
 3332|  77.3k|  const TileJobsDec *const buf1 = (const TileJobsDec *)a;
 3333|  77.3k|  const TileJobsDec *const buf2 = (const TileJobsDec *)b;
 3334|  77.3k|  return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size));
 3335|  77.3k|}
decodeframe.c:dec_alloc_cb_buf:
 3666|  68.0k|static inline void dec_alloc_cb_buf(AV1Decoder *pbi) {
 3667|  68.0k|  AV1_COMMON *const cm = &pbi->common;
 3668|  68.0k|  int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) *
 3669|  68.0k|             ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1);
 3670|       |
 3671|  68.0k|  if (pbi->cb_buffer_alloc_size < size) {
  ------------------
  |  Branch (3671:7): [True: 4.30k, False: 63.7k]
  ------------------
 3672|  4.30k|    av1_dec_free_cb_buf(pbi);
 3673|  4.30k|    CHECK_MEM_ERROR(cm, pbi->cb_buffer_base,
  ------------------
  |  |   51|  4.30k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  4.30k|  do {                                                    \
  |  |  |  |   69|  4.30k|    lval = (expr);                                        \
  |  |  |  |   70|  4.30k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 5, False: 4.30k]
  |  |  |  |  ------------------
  |  |  |  |   71|  4.30k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      5|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  4.30k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3674|  4.30k|                    aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size));
 3675|  4.30k|    memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size);
 3676|  4.30k|    pbi->cb_buffer_alloc_size = size;
 3677|  4.30k|  }
 3678|  68.0k|}
decodeframe.c:row_mt_frame_init:
 3683|  68.0k|                                     int end_tile, int max_sb_rows) {
 3684|  68.0k|  AV1_COMMON *const cm = &pbi->common;
 3685|  68.0k|  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 3686|       |
 3687|  68.0k|  frame_row_mt_info->tile_rows_start = tile_rows_start;
 3688|  68.0k|  frame_row_mt_info->tile_rows_end = tile_rows_end;
 3689|  68.0k|  frame_row_mt_info->tile_cols_start = tile_cols_start;
 3690|  68.0k|  frame_row_mt_info->tile_cols_end = tile_cols_end;
 3691|  68.0k|  frame_row_mt_info->start_tile = start_tile;
 3692|  68.0k|  frame_row_mt_info->end_tile = end_tile;
 3693|  68.0k|  frame_row_mt_info->mi_rows_to_decode = 0;
 3694|  68.0k|  frame_row_mt_info->mi_rows_parse_done = 0;
 3695|  68.0k|  frame_row_mt_info->mi_rows_decode_started = 0;
 3696|  68.0k|  frame_row_mt_info->row_mt_exit = 0;
 3697|       |
 3698|   153k|  for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
  ------------------
  |  Branch (3698:40): [True: 85.2k, False: 68.0k]
  ------------------
 3699|   176k|    for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
  ------------------
  |  Branch (3699:42): [True: 91.6k, False: 85.2k]
  ------------------
 3700|  91.6k|      if (tile_row * cm->tiles.cols + tile_col < start_tile ||
  ------------------
  |  Branch (3700:11): [True: 0, False: 91.6k]
  ------------------
 3701|  91.6k|          tile_row * cm->tiles.cols + tile_col > end_tile)
  ------------------
  |  Branch (3701:11): [True: 169, False: 91.5k]
  ------------------
 3702|    169|        continue;
 3703|       |
 3704|  91.5k|      TileDataDec *const tile_data =
 3705|  91.5k|          pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
 3706|  91.5k|      const TileInfo *const tile_info = &tile_data->tile_info;
 3707|       |
 3708|  91.5k|      tile_data->dec_row_mt_sync.mi_rows_parse_done = 0;
 3709|  91.5k|      tile_data->dec_row_mt_sync.mi_rows_decode_started = 0;
 3710|  91.5k|      tile_data->dec_row_mt_sync.num_threads_working = 0;
 3711|  91.5k|      tile_data->dec_row_mt_sync.mi_rows =
 3712|  91.5k|          ALIGN_POWER_OF_TWO(tile_info->mi_row_end - tile_info->mi_row_start,
  ------------------
  |  |   69|  91.5k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 3713|  91.5k|                             cm->seq_params->mib_size_log2);
 3714|  91.5k|      tile_data->dec_row_mt_sync.mi_cols =
 3715|  91.5k|          ALIGN_POWER_OF_TWO(tile_info->mi_col_end - tile_info->mi_col_start,
  ------------------
  |  |   69|  91.5k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
 3716|  91.5k|                             cm->seq_params->mib_size_log2);
 3717|  91.5k|      tile_data->dec_row_mt_sync.intrabc_extra_top_right_sb_delay =
 3718|  91.5k|          av1_get_intrabc_extra_top_right_sb_delay(cm);
 3719|       |
 3720|  91.5k|      frame_row_mt_info->mi_rows_to_decode +=
 3721|  91.5k|          tile_data->dec_row_mt_sync.mi_rows;
 3722|       |
 3723|       |      // Initialize cur_sb_col to -1 for all SB rows.
 3724|  91.5k|      memset(tile_data->dec_row_mt_sync.cur_sb_col, -1,
 3725|  91.5k|             sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows);
 3726|  91.5k|    }
 3727|  85.2k|  }
 3728|       |
 3729|  68.0k|#if CONFIG_MULTITHREAD
 3730|  68.0k|  if (pbi->row_mt_mutex_ == NULL) {
  ------------------
  |  Branch (3730:7): [True: 3.32k, False: 64.7k]
  ------------------
 3731|  3.32k|    CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_,
  ------------------
  |  |   51|  3.32k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.32k|  do {                                                    \
  |  |  |  |   69|  3.32k|    lval = (expr);                                        \
  |  |  |  |   70|  3.32k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.32k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.32k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.32k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3732|  3.32k|                    aom_malloc(sizeof(*(pbi->row_mt_mutex_))));
 3733|  3.32k|    if (pbi->row_mt_mutex_) {
  ------------------
  |  Branch (3733:9): [True: 3.32k, False: 0]
  ------------------
 3734|  3.32k|      pthread_mutex_init(pbi->row_mt_mutex_, NULL);
 3735|  3.32k|    }
 3736|  3.32k|  }
 3737|       |
 3738|  68.0k|  if (pbi->row_mt_cond_ == NULL) {
  ------------------
  |  Branch (3738:7): [True: 3.32k, False: 64.7k]
  ------------------
 3739|  3.32k|    CHECK_MEM_ERROR(cm, pbi->row_mt_cond_,
  ------------------
  |  |   51|  3.32k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  3.32k|  do {                                                    \
  |  |  |  |   69|  3.32k|    lval = (expr);                                        \
  |  |  |  |   70|  3.32k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 3.32k]
  |  |  |  |  ------------------
  |  |  |  |   71|  3.32k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  3.32k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 3740|  3.32k|                    aom_malloc(sizeof(*(pbi->row_mt_cond_))));
 3741|  3.32k|    if (pbi->row_mt_cond_) {
  ------------------
  |  Branch (3741:9): [True: 3.32k, False: 0]
  ------------------
 3742|  3.32k|      pthread_cond_init(pbi->row_mt_cond_, NULL);
 3743|  3.32k|    }
 3744|  3.32k|  }
 3745|  68.0k|#endif
 3746|  68.0k|}
decodeframe.c:reset_dec_workers:
 3439|  68.0k|                                     int num_workers) {
 3440|  68.0k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 3441|       |
 3442|       |  // Reset tile decoding hook
 3443|   193k|  for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
  ------------------
  |  Branch (3443:28): [True: 125k, False: 68.0k]
  ------------------
 3444|   125k|    AVxWorker *const worker = &pbi->tile_workers[worker_idx];
 3445|   125k|    DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
 3446|   125k|    thread_data->td->dcb = pbi->dcb;
 3447|   125k|    thread_data->td->dcb.corrupted = 0;
 3448|   125k|    thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0];
 3449|   125k|    thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1];
 3450|   125k|    thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
 3451|   125k|    if (worker_idx)
  ------------------
  |  Branch (3451:9): [True: 57.1k, False: 68.0k]
  ------------------
 3452|  57.1k|      thread_data->td->dcb.xd.seg_mask = thread_data->td->seg_mask;
 3453|   375k|    for (int j = 0; j < 2; ++j) {
  ------------------
  |  Branch (3453:21): [True: 250k, False: 125k]
  ------------------
 3454|   250k|      thread_data->td->dcb.xd.tmp_obmc_bufs[j] =
 3455|   250k|          thread_data->td->tmp_obmc_bufs[j];
 3456|   250k|    }
 3457|   125k|    winterface->sync(worker);
 3458|       |
 3459|   125k|    worker->hook = worker_hook;
 3460|   125k|    worker->data1 = thread_data;
 3461|   125k|    worker->data2 = pbi;
 3462|   125k|  }
 3463|       |#if CONFIG_ACCOUNTING
 3464|       |  if (pbi->acct_enabled) {
 3465|       |    aom_accounting_reset(&pbi->accounting);
 3466|       |  }
 3467|       |#endif
 3468|  68.0k|}
decodeframe.c:row_mt_worker_hook:
 3203|   125k|static int row_mt_worker_hook(void *arg1, void *arg2) {
 3204|   125k|  DecWorkerData *const thread_data = (DecWorkerData *)arg1;
 3205|   125k|  AV1Decoder *const pbi = (AV1Decoder *)arg2;
 3206|   125k|  ThreadData *const td = thread_data->td;
 3207|   125k|  uint8_t allow_update_cdf;
 3208|   125k|  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 3209|   125k|  td->dcb.corrupted = 0;
 3210|       |
 3211|       |  // The jmp_buf is valid only for the duration of the function that calls
 3212|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
 3213|       |  // before it returns.
 3214|   125k|  if (setjmp(thread_data->error_info.jmp)) {
 3215|  2.12k|    thread_data->error_info.setjmp = 0;
 3216|  2.12k|    thread_data->td->dcb.corrupted = 1;
 3217|  2.12k|#if CONFIG_MULTITHREAD
 3218|  2.12k|    pthread_mutex_lock(pbi->row_mt_mutex_);
 3219|  2.12k|#endif
 3220|  2.12k|    frame_row_mt_info->row_mt_exit = 1;
 3221|  2.12k|#if CONFIG_MULTITHREAD
 3222|  2.12k|    pthread_cond_broadcast(pbi->row_mt_cond_);
 3223|  2.12k|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 3224|  2.12k|#endif
 3225|       |    // If any SB row (erroneous row) processed by a thread encounters an
 3226|       |    // internal error, there is a need to indicate other threads that decoding
 3227|       |    // of the erroneous row is complete. This ensures that other threads which
 3228|       |    // wait upon the completion of SB's present in erroneous row are not waiting
 3229|       |    // indefinitely.
 3230|  2.12k|    signal_decoding_done_for_erroneous_row(pbi, &thread_data->td->dcb.xd);
 3231|  2.12k|    return 0;
 3232|  2.12k|  }
 3233|   123k|  thread_data->error_info.setjmp = 1;
 3234|       |
 3235|   123k|  AV1_COMMON *cm = &pbi->common;
 3236|   123k|  allow_update_cdf = cm->tiles.large_scale ? 0 : 1;
  ------------------
  |  Branch (3236:22): [True: 16.8k, False: 106k]
  ------------------
 3237|   123k|  allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
  ------------------
  |  Branch (3237:22): [True: 108k, False: 14.9k]
  |  Branch (3237:42): [True: 92.0k, False: 16.0k]
  ------------------
 3238|       |
 3239|   123k|  set_decode_func_pointers(td, 0x1);
 3240|       |
 3241|   123k|  assert(cm->tiles.cols > 0);
 3242|   212k|  while (!td->dcb.corrupted) {
  ------------------
  |  Branch (3242:10): [True: 189k, False: 23.3k]
  ------------------
 3243|   189k|    TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
 3244|       |
 3245|   189k|    if (cur_job_info != NULL) {
  ------------------
  |  Branch (3245:9): [True: 87.6k, False: 101k]
  ------------------
 3246|  87.6k|      const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
 3247|  87.6k|      TileDataDec *const tile_data = cur_job_info->tile_data;
 3248|  87.6k|      tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
 3249|  87.6k|                            allow_update_cdf);
 3250|  87.6k|#if CONFIG_MULTITHREAD
 3251|  87.6k|      pthread_mutex_lock(pbi->row_mt_mutex_);
 3252|  87.6k|#endif
 3253|  87.6k|      tile_data->dec_row_mt_sync.num_threads_working++;
 3254|  87.6k|#if CONFIG_MULTITHREAD
 3255|  87.6k|      pthread_mutex_unlock(pbi->row_mt_mutex_);
 3256|  87.6k|#endif
 3257|       |      // decode tile
 3258|  87.6k|      parse_tile_row_mt(pbi, td, tile_data);
 3259|  87.6k|#if CONFIG_MULTITHREAD
 3260|  87.6k|      pthread_mutex_lock(pbi->row_mt_mutex_);
 3261|  87.6k|#endif
 3262|  87.6k|      tile_data->dec_row_mt_sync.num_threads_working--;
 3263|  87.6k|#if CONFIG_MULTITHREAD
 3264|  87.6k|      pthread_mutex_unlock(pbi->row_mt_mutex_);
 3265|  87.6k|#endif
 3266|   101k|    } else {
 3267|   101k|      break;
 3268|   101k|    }
 3269|   189k|  }
 3270|       |
 3271|   124k|  if (td->dcb.corrupted) {
  ------------------
  |  Branch (3271:7): [True: 21.3k, False: 103k]
  ------------------
 3272|  21.3k|    thread_data->error_info.setjmp = 0;
 3273|  21.3k|#if CONFIG_MULTITHREAD
 3274|  21.3k|    pthread_mutex_lock(pbi->row_mt_mutex_);
 3275|  21.3k|#endif
 3276|  21.3k|    frame_row_mt_info->row_mt_exit = 1;
 3277|  21.3k|#if CONFIG_MULTITHREAD
 3278|  21.3k|    pthread_cond_broadcast(pbi->row_mt_cond_);
 3279|  21.3k|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 3280|  21.3k|#endif
 3281|  21.3k|    return 0;
 3282|  21.3k|  }
 3283|       |
 3284|   103k|  set_decode_func_pointers(td, 0x2);
 3285|       |
 3286|   229k|  while (1) {
  ------------------
  |  Branch (3286:10): [Folded - Ignored]
  ------------------
 3287|   227k|    AV1DecRowMTJobInfo next_job_info;
 3288|   227k|    int end_of_frame = 0;
 3289|       |
 3290|   227k|#if CONFIG_MULTITHREAD
 3291|   227k|    pthread_mutex_lock(pbi->row_mt_mutex_);
 3292|   227k|#endif
 3293|   262k|    while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) {
  ------------------
  |  Branch (3293:12): [True: 35.2k, False: 227k]
  ------------------
 3294|  35.2k|#if CONFIG_MULTITHREAD
 3295|  35.2k|      pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_);
 3296|  35.2k|#endif
 3297|  35.2k|    }
 3298|   227k|#if CONFIG_MULTITHREAD
 3299|   227k|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 3300|   227k|#endif
 3301|       |
 3302|   227k|    if (end_of_frame) break;
  ------------------
  |  Branch (3302:9): [True: 101k, False: 125k]
  ------------------
 3303|       |
 3304|   125k|    int tile_row = next_job_info.tile_row;
 3305|   125k|    int tile_col = next_job_info.tile_col;
 3306|   125k|    int mi_row = next_job_info.mi_row;
 3307|       |
 3308|   125k|    TileDataDec *tile_data =
 3309|   125k|        pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
 3310|   125k|    AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 3311|       |
 3312|   125k|    av1_tile_init(&td->dcb.xd.tile, cm, tile_row, tile_col);
 3313|   125k|    av1_init_macroblockd(cm, &td->dcb.xd);
 3314|   125k|    td->dcb.xd.error_info = &thread_data->error_info;
 3315|       |
 3316|   125k|    decode_tile_sb_row(pbi, td, &tile_data->tile_info, mi_row);
 3317|       |
 3318|   125k|#if CONFIG_MULTITHREAD
 3319|   125k|    pthread_mutex_lock(pbi->row_mt_mutex_);
 3320|   125k|#endif
 3321|   125k|    dec_row_mt_sync->num_threads_working--;
 3322|   125k|#if CONFIG_MULTITHREAD
 3323|   125k|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 3324|   125k|#endif
 3325|   125k|  }
 3326|   103k|  thread_data->error_info.setjmp = 0;
 3327|   103k|  return !td->dcb.corrupted;
 3328|   124k|}
decodeframe.c:signal_decoding_done_for_erroneous_row:
 2620|  2.12k|    AV1Decoder *const pbi, const MACROBLOCKD *const xd) {
 2621|  2.12k|  AV1_COMMON *const cm = &pbi->common;
 2622|  2.12k|  const TileInfo *const tile = &xd->tile;
 2623|  2.12k|  const int sb_row_in_tile =
 2624|  2.12k|      ((xd->mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2);
 2625|  2.12k|  const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile);
 2626|  2.12k|  TileDataDec *const tile_data =
 2627|  2.12k|      pbi->tile_data + tile->tile_row * cm->tiles.cols + tile->tile_col;
 2628|  2.12k|  AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 2629|       |
 2630|  2.12k|  sync_write(dec_row_mt_sync, sb_row_in_tile, sb_cols_in_tile - 1,
 2631|  2.12k|             sb_cols_in_tile);
 2632|  2.12k|}
decodeframe.c:sync_write:
 2590|   454k|                              int c, const int sb_cols) {
 2591|   454k|#if CONFIG_MULTITHREAD
 2592|   454k|  const int nsync = dec_row_mt_sync->sync_range;
 2593|   454k|  int cur;
 2594|   454k|  int sig = 1;
 2595|       |
 2596|   454k|  if (c < sb_cols - 1) {
  ------------------
  |  Branch (2596:7): [True: 326k, False: 128k]
  ------------------
 2597|   326k|    cur = c;
 2598|   326k|    if (c % nsync) sig = 0;
  ------------------
  |  Branch (2598:9): [True: 0, False: 326k]
  ------------------
 2599|   326k|  } else {
 2600|   128k|    cur = sb_cols + nsync + dec_row_mt_sync->intrabc_extra_top_right_sb_delay;
 2601|   128k|  }
 2602|       |
 2603|   454k|  if (sig) {
  ------------------
  |  Branch (2603:7): [True: 454k, False: 0]
  ------------------
 2604|   454k|    pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]);
 2605|       |
 2606|   454k|    dec_row_mt_sync->cur_sb_col[r] = cur;
 2607|       |
 2608|   454k|    pthread_cond_signal(&dec_row_mt_sync->cond_[r]);
 2609|   454k|    pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]);
 2610|   454k|  }
 2611|       |#else
 2612|       |  (void)dec_row_mt_sync;
 2613|       |  (void)r;
 2614|       |  (void)c;
 2615|       |  (void)sb_cols;
 2616|       |#endif  // CONFIG_MULTITHREAD
 2617|   454k|}
decodeframe.c:set_decode_func_pointers:
 2698|   301k|                                            int parse_decode_flag) {
 2699|   301k|  td->read_coeffs_tx_intra_block_visit = decode_block_void;
 2700|   301k|  td->predict_and_recon_intra_block_visit = decode_block_void;
 2701|   301k|  td->read_coeffs_tx_inter_block_visit = decode_block_void;
 2702|   301k|  td->inverse_tx_inter_block_visit = decode_block_void;
 2703|   301k|  td->predict_inter_block_visit = predict_inter_block_void;
 2704|   301k|  td->cfl_store_inter_block_visit = cfl_store_inter_block_void;
 2705|       |
 2706|   301k|  if (parse_decode_flag & 0x1) {
  ------------------
  |  Branch (2706:7): [True: 200k, False: 101k]
  ------------------
 2707|   200k|    td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block;
 2708|   200k|    td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb;
 2709|   200k|  }
 2710|   301k|  if (parse_decode_flag & 0x2) {
  ------------------
  |  Branch (2710:7): [True: 177k, False: 124k]
  ------------------
 2711|   177k|    td->predict_and_recon_intra_block_visit =
 2712|   177k|        predict_and_reconstruct_intra_block;
 2713|   177k|    td->inverse_tx_inter_block_visit = inverse_transform_inter_block;
 2714|   177k|    td->predict_inter_block_visit = predict_inter_block;
 2715|   177k|    td->cfl_store_inter_block_visit = cfl_store_inter_block;
 2716|   177k|  }
 2717|   301k|}
decodeframe.c:decode_block_void:
  190|   172M|                                     const TX_SIZE tx_size) {
  191|   172M|  (void)cm;
  192|   172M|  (void)dcb;
  193|   172M|  (void)r;
  194|   172M|  (void)plane;
  195|   172M|  (void)row;
  196|   172M|  (void)col;
  197|   172M|  (void)tx_size;
  198|   172M|}
decodeframe.c:predict_inter_block_void:
  202|  3.69M|                                            BLOCK_SIZE bsize) {
  203|  3.69M|  (void)cm;
  204|  3.69M|  (void)dcb;
  205|  3.69M|  (void)bsize;
  206|  3.69M|}
decodeframe.c:cfl_store_inter_block_void:
  209|  3.68M|                                              MACROBLOCKD *const xd) {
  210|  3.68M|  (void)cm;
  211|  3.68M|  (void)xd;
  212|  3.68M|}
decodeframe.c:read_coeffs_tx_intra_block:
  169|   120M|    const int plane, const int row, const int col, const TX_SIZE tx_size) {
  170|   120M|  MB_MODE_INFO *mbmi = dcb->xd.mi[0];
  171|   120M|  if (!mbmi->skip_txfm) {
  ------------------
  |  Branch (171:7): [True: 25.4M, False: 95.1M]
  ------------------
  172|       |#if TXCOEFF_TIMER
  173|       |    struct aom_usec_timer timer;
  174|       |    aom_usec_timer_start(&timer);
  175|       |#endif
  176|  25.4M|    av1_read_coeffs_txb(cm, dcb, r, plane, row, col, tx_size);
  177|       |#if TXCOEFF_TIMER
  178|       |    aom_usec_timer_mark(&timer);
  179|       |    const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
  180|       |    cm->txcoeff_timer += elapsed_time;
  181|       |    ++cm->txb_count;
  182|       |#endif
  183|  25.4M|  }
  184|   120M|}
decodeframe.c:predict_and_reconstruct_intra_block:
  216|   112M|    const int plane, const int row, const int col, const TX_SIZE tx_size) {
  217|   112M|  (void)r;
  218|   112M|  MACROBLOCKD *const xd = &dcb->xd;
  219|   112M|  MB_MODE_INFO *mbmi = xd->mi[0];
  220|   112M|  PLANE_TYPE plane_type = get_plane_type(plane);
  221|       |
  222|   112M|  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
  223|       |
  224|   112M|  if (!mbmi->skip_txfm) {
  ------------------
  |  Branch (224:7): [True: 20.2M, False: 92.3M]
  ------------------
  225|  20.2M|    eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
  226|  20.2M|    if (eob_data->eob) {
  ------------------
  |  Branch (226:9): [True: 10.1M, False: 10.1M]
  ------------------
  227|  10.1M|      const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
  228|       |      // tx_type was read out in av1_read_coeffs_txb.
  229|  10.1M|      const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size,
  230|  10.1M|                                              reduced_tx_set_used);
  231|  10.1M|      struct macroblockd_plane *const pd = &xd->plane[plane];
  232|  10.1M|      uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
  ------------------
  |  |   39|  10.1M|#define MI_SIZE_LOG2 2
  ------------------
  233|  10.1M|      inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
  234|  10.1M|                              reduced_tx_set_used);
  235|  10.1M|    }
  236|  20.2M|  }
  237|   112M|  if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) {
  ------------------
  |  |  226|   225M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (237:7): [True: 39.0M, False: 73.4M]
  |  Branch (237:31): [True: 2.28M, False: 36.8M]
  ------------------
  238|  2.28M|    cfl_store_tx(xd, row, col, tx_size, mbmi->bsize);
  239|  2.28M|  }
  240|   112M|}
decodeframe.c:inverse_transform_block:
  157|  19.3M|                                           int stride, int reduced_tx_set) {
  158|  19.3M|  tran_low_t *const dqcoeff = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
  159|  19.3M|  eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
  160|  19.3M|  uint16_t scan_line = eob_data->max_scan_line;
  161|  19.3M|  uint16_t eob = eob_data->eob;
  162|  19.3M|  av1_inverse_transform_block(&dcb->xd, dqcoeff, plane, tx_type, tx_size, dst,
  163|  19.3M|                              stride, eob, reduced_tx_set);
  164|  19.3M|  memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
  165|  19.3M|}
decodeframe.c:inverse_transform_inter_block:
  245|  9.18M|    const TX_SIZE tx_size) {
  246|  9.18M|  (void)r;
  247|  9.18M|  MACROBLOCKD *const xd = &dcb->xd;
  248|  9.18M|  PLANE_TYPE plane_type = get_plane_type(plane);
  249|  9.18M|  const struct macroblockd_plane *const pd = &xd->plane[plane];
  250|  9.18M|  const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
  251|       |  // tx_type was read out in av1_read_coeffs_txb.
  252|  9.18M|  const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
  253|  9.18M|                                          tx_size, reduced_tx_set_used);
  254|       |
  255|  9.18M|  uint8_t *dst =
  256|  9.18M|      &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
  ------------------
  |  |   39|  9.18M|#define MI_SIZE_LOG2 2
  ------------------
  257|  9.18M|  inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
  258|  9.18M|                          reduced_tx_set_used);
  259|       |#if CONFIG_MISMATCH_DEBUG
  260|       |  int pixel_c, pixel_r;
  261|       |  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
  262|       |  int blk_w = block_size_wide[bsize];
  263|       |  int blk_h = block_size_high[bsize];
  264|       |  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
  265|       |  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
  266|       |  mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
  267|       |                  pd->subsampling_x, pd->subsampling_y);
  268|       |  mismatch_check_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
  269|       |                          plane, pixel_c, pixel_r, blk_w, blk_h,
  270|       |                          xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
  271|       |#endif
  272|  9.18M|}
decodeframe.c:predict_inter_block:
  848|  4.23M|                                       BLOCK_SIZE bsize) {
  849|  4.23M|  MACROBLOCKD *const xd = &dcb->xd;
  850|  4.23M|  MB_MODE_INFO *mbmi = xd->mi[0];
  851|  4.23M|  const int num_planes = av1_num_planes(cm);
  852|  4.23M|  const int mi_row = xd->mi_row;
  853|  4.23M|  const int mi_col = xd->mi_col;
  854|  9.04M|  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
  ------------------
  |  Branch (854:21): [True: 4.80M, False: 4.23M]
  ------------------
  855|  4.80M|    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
  856|  4.80M|    if (frame < LAST_FRAME) {
  ------------------
  |  Branch (856:9): [True: 52.5k, False: 4.75M]
  ------------------
  857|  52.5k|      assert(is_intrabc_block(mbmi));
  858|  52.5k|      assert(frame == INTRA_FRAME);
  859|  52.5k|      assert(ref == 0);
  860|  4.75M|    } else {
  861|  4.75M|      const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame);
  862|  4.75M|      const struct scale_factors *ref_scale_factors =
  863|  4.75M|          get_ref_scale_factors_const(cm, frame);
  864|       |
  865|  4.75M|      xd->block_ref_scale_factors[ref] = ref_scale_factors;
  866|  4.75M|      av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col,
  867|  4.75M|                           ref_scale_factors, num_planes);
  868|  4.75M|    }
  869|  4.80M|  }
  870|       |
  871|  4.23M|  dec_build_inter_predictor(cm, dcb, mi_row, mi_col, bsize);
  872|  4.23M|  if (mbmi->motion_mode == OBMC_CAUSAL) {
  ------------------
  |  Branch (872:7): [True: 633k, False: 3.60M]
  ------------------
  873|   633k|    dec_build_obmc_inter_predictors_sb(cm, dcb);
  874|   633k|  }
  875|       |#if CONFIG_MISMATCH_DEBUG
  876|       |  for (int plane = 0; plane < num_planes; ++plane) {
  877|       |    const struct macroblockd_plane *pd = &xd->plane[plane];
  878|       |    int pixel_c, pixel_r;
  879|       |    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x,
  880|       |                    pd->subsampling_y);
  881|       |    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
  882|       |                             pd->subsampling_y))
  883|       |      continue;
  884|       |    mismatch_check_block_pre(pd->dst.buf, pd->dst.stride,
  885|       |                             cm->current_frame.order_hint, plane, pixel_c,
  886|       |                             pixel_r, pd->width, pd->height,
  887|       |                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
  888|       |  }
  889|       |#endif
  890|  4.23M|}
decodeframe.c:dec_build_inter_predictor:
  678|  4.23M|                                             BLOCK_SIZE bsize) {
  679|  4.23M|  MACROBLOCKD *const xd = &dcb->xd;
  680|  4.23M|  const int num_planes = av1_num_planes(cm);
  681|  15.4M|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (681:23): [True: 11.9M, False: 3.49M]
  ------------------
  682|  11.9M|    if (plane && !xd->is_chroma_ref) break;
  ------------------
  |  Branch (682:9): [True: 7.69M, False: 4.23M]
  |  Branch (682:18): [True: 745k, False: 6.94M]
  ------------------
  683|  11.1M|    const int mi_x = mi_col * MI_SIZE;
  ------------------
  |  |   40|  11.1M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.1M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  684|  11.1M|    const int mi_y = mi_row * MI_SIZE;
  ------------------
  |  |   40|  11.1M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  11.1M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  685|  11.1M|    dec_build_inter_predictors(cm, dcb, plane, xd->mi[0], 0,
  686|  11.1M|                               xd->plane[plane].width, xd->plane[plane].height,
  687|  11.1M|                               mi_x, mi_y);
  688|  11.1M|    if (is_interintra_pred(xd->mi[0])) {
  ------------------
  |  Branch (688:9): [True: 958k, False: 10.2M]
  ------------------
  689|   958k|      BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf,
  690|   958k|                           xd->plane[2].dst.buf },
  691|   958k|                         { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
  692|   958k|                           xd->plane[2].dst.stride } };
  693|   958k|      av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf,
  694|   958k|                                     xd->plane[plane].dst.stride, &ctx, plane,
  695|   958k|                                     bsize);
  696|   958k|    }
  697|  11.1M|  }
  698|  4.23M|}
decodeframe.c:dec_build_inter_predictors:
  670|  14.0M|                                       int mi_x, int mi_y) {
  671|  14.0M|  build_inter_predictors(cm, &dcb->xd, plane, mi, build_for_obmc, bw, bh, mi_x,
  672|  14.0M|                         mi_y, dcb->mc_buf);
  673|  14.0M|}
decodeframe.c:dec_calc_subpel_params_and_extend:
  648|  17.0M|    uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
  649|  17.0M|  PadBlock block;
  650|  17.0M|  MV32 scaled_mv;
  651|  17.0M|  int subpel_x_mv, subpel_y_mv;
  652|  17.0M|  dec_calc_subpel_params(src_mv, inter_pred_params, xd, mi_x, mi_y, pre,
  653|  17.0M|                         subpel_params, src_stride, &block, &scaled_mv,
  654|  17.0M|                         &subpel_x_mv, &subpel_y_mv);
  655|  17.0M|  extend_mc_border(
  656|  17.0M|      inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf,
  657|  17.0M|      scaled_mv, block, subpel_x_mv, subpel_y_mv,
  658|  17.0M|      inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc,
  659|  17.0M|      inter_pred_params->use_hbd_buf, mc_buf[ref], pre, src_stride);
  660|  17.0M|}
decodeframe.c:dec_calc_subpel_params:
  563|  17.0M|    MV32 *scaled_mv, int *subpel_x_mv, int *subpel_y_mv) {
  564|  17.0M|  const struct scale_factors *sf = inter_pred_params->scale_factors;
  565|  17.0M|  struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
  566|  17.0M|  const int bw = inter_pred_params->block_width;
  567|  17.0M|  const int bh = inter_pred_params->block_height;
  568|  17.0M|  const int is_scaled = av1_is_scaled(sf);
  569|  17.0M|  if (is_scaled) {
  ------------------
  |  Branch (569:7): [True: 2.31M, False: 14.7M]
  ------------------
  570|  2.31M|    int ssx = inter_pred_params->subsampling_x;
  571|  2.31M|    int ssy = inter_pred_params->subsampling_y;
  572|  2.31M|    int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
  ------------------
  |  |   23|  2.31M|#define SUBPEL_BITS 4
  ------------------
  573|  2.31M|    orig_pos_y += src_mv->row * (1 << (1 - ssy));
  574|  2.31M|    int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
  ------------------
  |  |   23|  2.31M|#define SUBPEL_BITS 4
  ------------------
  575|  2.31M|    orig_pos_x += src_mv->col * (1 << (1 - ssx));
  576|  2.31M|    int pos_y = av1_scaled_y(orig_pos_y, sf);
  577|  2.31M|    int pos_x = av1_scaled_x(orig_pos_x, sf);
  578|  2.31M|    pos_x += SCALE_EXTRA_OFF;
  ------------------
  |  |   32|  2.31M|#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
  |  |  ------------------
  |  |  |  |   31|  2.31M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|  2.31M|#define SUBPEL_BITS 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  579|  2.31M|    pos_y += SCALE_EXTRA_OFF;
  ------------------
  |  |   32|  2.31M|#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
  |  |  ------------------
  |  |  |  |   31|  2.31M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|  2.31M|#define SUBPEL_BITS 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  580|       |
  581|  2.31M|    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
  ------------------
  |  |   32|  2.31M|  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  2.31M|  ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|  2.31M|#define AOM_BORDER_IN_PIXELS 288
  |  |  |  |  ------------------
  |  |  |  |                 ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  2.31M|#define AOM_INTERP_EXTEND 4
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  582|  2.31M|    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
  ------------------
  |  |   32|  2.31M|  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   30|  2.31M|  ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   32|  2.31M|#define AOM_BORDER_IN_PIXELS 288
  |  |  |  |  ------------------
  |  |  |  |                 ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
  |  |  |  |  ------------------
  |  |  |  |  |  |   31|  2.31M|#define AOM_INTERP_EXTEND 4
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  583|  2.31M|    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
  ------------------
  |  |   31|  2.31M|#define AOM_INTERP_EXTEND 4
  ------------------
  584|  2.31M|                       << SCALE_SUBPEL_BITS;
  ------------------
  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  ------------------
  585|  2.31M|    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
  ------------------
  |  |   31|  2.31M|#define AOM_INTERP_EXTEND 4
  ------------------
                  const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
  ------------------
  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  ------------------
  586|  2.31M|    pos_y = clamp(pos_y, top, bottom);
  587|  2.31M|    pos_x = clamp(pos_x, left, right);
  588|       |
  589|  2.31M|    subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
  ------------------
  |  |   30|  2.31M|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|  2.31M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  590|  2.31M|    subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
  ------------------
  |  |   30|  2.31M|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|  2.31M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  591|  2.31M|    subpel_params->xs = sf->x_step_q4;
  592|  2.31M|    subpel_params->ys = sf->y_step_q4;
  593|       |
  594|       |    // Get reference block top left coordinate.
  595|  2.31M|    block->x0 = pos_x >> SCALE_SUBPEL_BITS;
  ------------------
  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  ------------------
  596|  2.31M|    block->y0 = pos_y >> SCALE_SUBPEL_BITS;
  ------------------
  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  ------------------
  597|       |
  598|       |    // Get reference block bottom right coordinate.
  599|  2.31M|    block->x1 =
  600|  2.31M|        ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1;
  ------------------
  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  ------------------
  601|  2.31M|    block->y1 =
  602|  2.31M|        ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1;
  ------------------
  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  ------------------
  603|       |
  604|  2.31M|    MV temp_mv;
  605|  2.31M|    temp_mv = clamp_mv_to_umv_border_sb(xd, src_mv, bw, bh,
  606|  2.31M|                                        inter_pred_params->subsampling_x,
  607|  2.31M|                                        inter_pred_params->subsampling_y);
  608|  2.31M|    *scaled_mv = av1_scale_mv(&temp_mv, mi_x, mi_y, sf);
  609|  2.31M|    scaled_mv->row += SCALE_EXTRA_OFF;
  ------------------
  |  |   32|  2.31M|#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
  |  |  ------------------
  |  |  |  |   31|  2.31M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|  2.31M|#define SUBPEL_BITS 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  610|  2.31M|    scaled_mv->col += SCALE_EXTRA_OFF;
  ------------------
  |  |   32|  2.31M|#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
  |  |  ------------------
  |  |  |  |   31|  2.31M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   23|  2.31M|#define SUBPEL_BITS 4
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  611|       |
  612|  2.31M|    *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK;
  ------------------
  |  |   30|  2.31M|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|  2.31M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  613|  2.31M|    *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK;
  ------------------
  |  |   30|  2.31M|#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
  |  |  ------------------
  |  |  |  |   29|  2.31M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  |  |  ------------------
  |  |  |  |  |  |   28|  2.31M|#define SCALE_SUBPEL_BITS 10
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  614|  14.7M|  } else {
  615|       |    // Get block position in current frame.
  616|  14.7M|    int pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
  ------------------
  |  |   23|  14.7M|#define SUBPEL_BITS 4
  ------------------
  617|  14.7M|    int pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
  ------------------
  |  |   23|  14.7M|#define SUBPEL_BITS 4
  ------------------
  618|       |
  619|  14.7M|    const MV mv_q4 = clamp_mv_to_umv_border_sb(
  620|  14.7M|        xd, src_mv, bw, bh, inter_pred_params->subsampling_x,
  621|  14.7M|        inter_pred_params->subsampling_y);
  622|  14.7M|    subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
  ------------------
  |  |   29|  14.7M|#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  14.7M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  ------------------
  623|  14.7M|    subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
  ------------------
  |  |   24|  14.7M|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  14.7M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
                  subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  14.7M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  14.7M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  14.7M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  624|  14.7M|    subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
  ------------------
  |  |   24|  14.7M|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  14.7M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
                  subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
  ------------------
  |  |   31|  14.7M|#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   28|  14.7M|#define SCALE_SUBPEL_BITS 10
  |  |  ------------------
  |  |               #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  14.7M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  625|       |
  626|       |    // Get reference block top left coordinate.
  627|  14.7M|    pos_x += mv_q4.col;
  628|  14.7M|    pos_y += mv_q4.row;
  629|  14.7M|    block->x0 = pos_x >> SUBPEL_BITS;
  ------------------
  |  |   23|  14.7M|#define SUBPEL_BITS 4
  ------------------
  630|  14.7M|    block->y0 = pos_y >> SUBPEL_BITS;
  ------------------
  |  |   23|  14.7M|#define SUBPEL_BITS 4
  ------------------
  631|       |
  632|       |    // Get reference block bottom right coordinate.
  633|  14.7M|    block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1;
  ------------------
  |  |   23|  14.7M|#define SUBPEL_BITS 4
  ------------------
  634|  14.7M|    block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1;
  ------------------
  |  |   23|  14.7M|#define SUBPEL_BITS 4
  ------------------
  635|       |
  636|  14.7M|    scaled_mv->row = mv_q4.row;
  637|  14.7M|    scaled_mv->col = mv_q4.col;
  638|  14.7M|    *subpel_x_mv = scaled_mv->col & SUBPEL_MASK;
  ------------------
  |  |   24|  14.7M|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  14.7M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  639|  14.7M|    *subpel_y_mv = scaled_mv->row & SUBPEL_MASK;
  ------------------
  |  |   24|  14.7M|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  14.7M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  640|  14.7M|  }
  641|  17.0M|  *pre = pre_buf->buf0 + block->y0 * pre_buf->stride + block->x0;
  642|  17.0M|  *src_stride = pre_buf->stride;
  643|  17.0M|}
decodeframe.c:extend_mc_border:
  526|  17.0M|                                    int *src_stride) {
  527|  17.0M|  int x_pad = 0, y_pad = 0;
  528|  17.0M|  if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block,
  ------------------
  |  Branch (528:7): [True: 2.62M, False: 14.3M]
  ------------------
  529|  17.0M|                                     subpel_x_mv, subpel_y_mv, do_warp,
  530|  17.0M|                                     is_intrabc, &x_pad, &y_pad)) {
  531|       |    // Get reference block pointer.
  532|  2.62M|    const uint8_t *const buf_ptr =
  533|  2.62M|        pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
  534|  2.62M|    int buf_stride = pre_buf->stride;
  535|  2.62M|    const int b_w = block.x1 - block.x0;
  536|  2.62M|    const int b_h = block.y1 - block.y0;
  537|       |
  538|  2.62M|#if CONFIG_AV1_HIGHBITDEPTH
  539|       |    // Extend the border.
  540|  2.62M|    if (highbd) {
  ------------------
  |  Branch (540:9): [True: 1.16M, False: 1.45M]
  ------------------
  541|  1.16M|      highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0,
  542|  1.16M|                             block.y0, b_w, b_h, pre_buf->width,
  543|  1.16M|                             pre_buf->height);
  544|  1.45M|    } else {
  545|  1.45M|      build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
  546|  1.45M|                      b_h, pre_buf->width, pre_buf->height);
  547|  1.45M|    }
  548|       |#else
  549|       |    (void)highbd;
  550|       |    build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
  551|       |                    b_h, pre_buf->width, pre_buf->height);
  552|       |#endif
  553|  2.62M|    *src_stride = b_w;
  554|  2.62M|    *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w +
  ------------------
  |  |   31|  2.62M|#define AOM_INTERP_EXTEND 4
  ------------------
  555|  2.62M|           x_pad * (AOM_INTERP_EXTEND - 1);
  ------------------
  |  |   31|  2.62M|#define AOM_INTERP_EXTEND 4
  ------------------
  556|  2.62M|  }
  557|  17.0M|}
decodeframe.c:update_extend_mc_border_params:
  488|  17.0M|    int do_warp, int is_intrabc, int *x_pad, int *y_pad) {
  489|  17.0M|  const int is_scaled = av1_is_scaled(sf);
  490|       |  // Get reference width and height.
  491|  17.0M|  int frame_width = pre_buf->width;
  492|  17.0M|  int frame_height = pre_buf->height;
  493|       |
  494|       |  // Do border extension if there is motion or
  495|       |  // width/height is not a multiple of 8 pixels.
  496|  17.0M|  if ((!is_intrabc) && (!do_warp) &&
  ------------------
  |  Branch (496:7): [True: 16.8M, False: 148k]
  |  Branch (496:24): [True: 16.3M, False: 501k]
  ------------------
  497|  17.0M|      (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) ||
  ------------------
  |  Branch (497:8): [True: 2.31M, False: 14.0M]
  |  Branch (497:21): [True: 9.78M, False: 4.27M]
  |  Branch (497:38): [True: 1.02M, False: 3.25M]
  |  Branch (497:55): [True: 615k, False: 2.64M]
  ------------------
  498|  16.3M|       (frame_height & 0x7))) {
  ------------------
  |  Branch (498:8): [True: 86.6k, False: 2.55M]
  ------------------
  499|  13.8M|    if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
  ------------------
  |  |   25|  3.53M|#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  3.53M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (499:9): [True: 10.2M, False: 3.53M]
  |  Branch (499:24): [True: 3.53M, False: 0]
  ------------------
  500|  13.8M|      block->x0 -= AOM_INTERP_EXTEND - 1;
  ------------------
  |  |   31|  13.8M|#define AOM_INTERP_EXTEND 4
  ------------------
  501|  13.8M|      block->x1 += AOM_INTERP_EXTEND;
  ------------------
  |  |   31|  13.8M|#define AOM_INTERP_EXTEND 4
  ------------------
  502|  13.8M|      *x_pad = 1;
  503|  13.8M|    }
  504|       |
  505|  13.8M|    if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
  ------------------
  |  |   25|  3.68M|#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
  |  |  ------------------
  |  |  |  |   23|  3.68M|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  |  Branch (505:9): [True: 10.1M, False: 3.68M]
  |  Branch (505:24): [True: 3.68M, False: 0]
  ------------------
  506|  13.8M|      block->y0 -= AOM_INTERP_EXTEND - 1;
  ------------------
  |  |   31|  13.8M|#define AOM_INTERP_EXTEND 4
  ------------------
  507|  13.8M|      block->y1 += AOM_INTERP_EXTEND;
  ------------------
  |  |   31|  13.8M|#define AOM_INTERP_EXTEND 4
  ------------------
  508|  13.8M|      *y_pad = 1;
  509|  13.8M|    }
  510|       |
  511|       |    // Skip border extension if block is inside the frame.
  512|  13.8M|    if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 ||
  ------------------
  |  Branch (512:9): [True: 553k, False: 13.2M]
  |  Branch (512:26): [True: 811k, False: 12.4M]
  |  Branch (512:57): [True: 540k, False: 11.9M]
  ------------------
  513|  13.8M|        block->y1 > frame_height - 1) {
  ------------------
  |  Branch (513:9): [True: 713k, False: 11.1M]
  ------------------
  514|  2.62M|      return 1;
  515|  2.62M|    }
  516|  13.8M|  }
  517|  14.3M|  return 0;
  518|  17.0M|}
decodeframe.c:highbd_build_mc_border:
  412|  1.16M|                                          int h) {
  413|       |  // Get a pointer to the start of the real data for this row.
  414|  1.16M|  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  ------------------
  |  |   75|  1.16M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  415|  1.16M|  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  ------------------
  |  |   75|  1.16M|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  416|  1.16M|  const uint16_t *ref_row = src - x - y * src_stride;
  417|       |
  418|  1.16M|  if (y >= h)
  ------------------
  |  Branch (418:7): [True: 150k, False: 1.01M]
  ------------------
  419|   150k|    ref_row += (h - 1) * src_stride;
  420|  1.01M|  else if (y > 0)
  ------------------
  |  Branch (420:12): [True: 598k, False: 420k]
  ------------------
  421|   598k|    ref_row += y * src_stride;
  422|       |
  423|  22.8M|  do {
  424|  22.8M|    int right = 0, copy;
  425|  22.8M|    int left = x < 0 ? -x : 0;
  ------------------
  |  Branch (425:16): [True: 5.35M, False: 17.5M]
  ------------------
  426|       |
  427|  22.8M|    if (left > b_w) left = b_w;
  ------------------
  |  Branch (427:9): [True: 692k, False: 22.1M]
  ------------------
  428|       |
  429|  22.8M|    if (x + b_w > w) right = x + b_w - w;
  ------------------
  |  Branch (429:9): [True: 6.22M, False: 16.6M]
  ------------------
  430|       |
  431|  22.8M|    if (right > b_w) right = b_w;
  ------------------
  |  Branch (431:9): [True: 186k, False: 22.6M]
  ------------------
  432|       |
  433|  22.8M|    copy = b_w - left - right;
  434|       |
  435|  22.8M|    if (left) aom_memset16(dst, ref_row[0], left);
  ------------------
  |  Branch (435:9): [True: 5.35M, False: 17.5M]
  ------------------
  436|       |
  437|  22.8M|    if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
  ------------------
  |  Branch (437:9): [True: 19.5M, False: 3.37M]
  ------------------
  438|       |
  439|  22.8M|    if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right);
  ------------------
  |  Branch (439:9): [True: 6.22M, False: 16.6M]
  ------------------
  440|       |
  441|  22.8M|    dst += dst_stride;
  442|  22.8M|    ++y;
  443|       |
  444|  22.8M|    if (y > 0 && y < h) ref_row += src_stride;
  ------------------
  |  Branch (444:9): [True: 19.9M, False: 2.91M]
  |  Branch (444:18): [True: 16.1M, False: 3.78M]
  ------------------
  445|  22.8M|  } while (--b_h);
  ------------------
  |  Branch (445:12): [True: 21.7M, False: 1.16M]
  ------------------
  446|  1.16M|}
decodeframe.c:build_mc_border:
  451|  1.45M|                                   int b_w, int b_h, int w, int h) {
  452|       |  // Get a pointer to the start of the real data for this row.
  453|  1.45M|  const uint8_t *ref_row = src - x - y * src_stride;
  454|       |
  455|  1.45M|  if (y >= h)
  ------------------
  |  Branch (455:7): [True: 370k, False: 1.08M]
  ------------------
  456|   370k|    ref_row += (h - 1) * src_stride;
  457|  1.08M|  else if (y > 0)
  ------------------
  |  Branch (457:12): [True: 583k, False: 504k]
  ------------------
  458|   583k|    ref_row += y * src_stride;
  459|       |
  460|  26.0M|  do {
  461|  26.0M|    int right = 0, copy;
  462|  26.0M|    int left = x < 0 ? -x : 0;
  ------------------
  |  Branch (462:16): [True: 6.56M, False: 19.4M]
  ------------------
  463|       |
  464|  26.0M|    if (left > b_w) left = b_w;
  ------------------
  |  Branch (464:9): [True: 902k, False: 25.1M]
  ------------------
  465|       |
  466|  26.0M|    if (x + b_w > w) right = x + b_w - w;
  ------------------
  |  Branch (466:9): [True: 6.53M, False: 19.5M]
  ------------------
  467|       |
  468|  26.0M|    if (right > b_w) right = b_w;
  ------------------
  |  Branch (468:9): [True: 1.10M, False: 24.9M]
  ------------------
  469|       |
  470|  26.0M|    copy = b_w - left - right;
  471|       |
  472|  26.0M|    if (left) memset(dst, ref_row[0], left);
  ------------------
  |  Branch (472:9): [True: 6.56M, False: 19.4M]
  ------------------
  473|       |
  474|  26.0M|    if (copy) memcpy(dst + left, ref_row + x + left, copy);
  ------------------
  |  Branch (474:9): [True: 20.2M, False: 5.78M]
  ------------------
  475|       |
  476|  26.0M|    if (right) memset(dst + left + copy, ref_row[w - 1], right);
  ------------------
  |  Branch (476:9): [True: 6.53M, False: 19.5M]
  ------------------
  477|       |
  478|  26.0M|    dst += dst_stride;
  479|  26.0M|    ++y;
  480|       |
  481|  26.0M|    if (y > 0 && y < h) ref_row += src_stride;
  ------------------
  |  Branch (481:9): [True: 21.2M, False: 4.81M]
  |  Branch (481:18): [True: 13.1M, False: 8.05M]
  ------------------
  482|  26.0M|  } while (--b_h);
  ------------------
  |  Branch (482:12): [True: 24.6M, False: 1.45M]
  ------------------
  483|  1.45M|}
decodeframe.c:dec_build_obmc_inter_predictors_sb:
  813|   633k|                                                      DecoderCodingBlock *dcb) {
  814|   633k|  const int num_planes = av1_num_planes(cm);
  815|   633k|  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
  816|   633k|  int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  817|   633k|  int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  818|   633k|  int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  819|   633k|  int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  820|   633k|  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  821|   633k|  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
                int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  ------------------
  |  |   32|   633k|#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   31|   633k|#define MAX_SB_SIZE_LOG2 7
  |  |  ------------------
  ------------------
  822|       |
  823|   633k|  MACROBLOCKD *const xd = &dcb->xd;
  824|   633k|  av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
  825|       |
  826|   633k|  dec_build_prediction_by_above_preds(cm, dcb, dst_buf1, dst_width1,
  827|   633k|                                      dst_height1, dst_stride1);
  828|   633k|  dec_build_prediction_by_left_preds(cm, dcb, dst_buf2, dst_width2, dst_height2,
  829|   633k|                                     dst_stride2);
  830|   633k|  const int mi_row = xd->mi_row;
  831|   633k|  const int mi_col = xd->mi_col;
  832|   633k|  av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
  833|   633k|                       mi_col, 0, num_planes);
  834|   633k|  av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
  835|   633k|                                  dst_stride2);
  836|   633k|}
decodeframe.c:dec_build_prediction_by_above_preds:
  733|   633k|    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
  734|   633k|  MACROBLOCKD *const xd = &dcb->xd;
  735|   633k|  if (!xd->up_available) return;
  ------------------
  |  Branch (735:7): [True: 8.87k, False: 624k]
  ------------------
  736|       |
  737|       |  // Adjust mb_to_bottom_edge to have the correct value for the OBMC
  738|       |  // prediction block. This is half the height of the original block,
  739|       |  // except for 128-wide blocks, where we only use a height of 32.
  740|   624k|  const int this_height = xd->height * MI_SIZE;
  ------------------
  |  |   40|   624k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   624k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  741|   624k|  const int pred_height = AOMMIN(this_height / 2, 32);
  ------------------
  |  |   34|   624k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 617k, False: 7.16k]
  |  |  ------------------
  ------------------
  742|   624k|  xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height);
  ------------------
  |  |   29|   624k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
  743|   624k|  struct build_prediction_ctxt ctxt = {
  744|   624k|    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, dcb
  745|   624k|  };
  746|   624k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  747|   624k|  foreach_overlappable_nb_above(cm, xd,
  748|   624k|                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
  749|   624k|                                dec_build_prediction_by_above_pred, &ctxt);
  750|       |
  751|   624k|  xd->mb_to_left_edge = -GET_MV_SUBPEL(xd->mi_col * MI_SIZE);
  ------------------
  |  |   29|   624k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
  752|   624k|  xd->mb_to_right_edge = ctxt.mb_to_far_edge;
  753|   624k|  xd->mb_to_bottom_edge -= GET_MV_SUBPEL(this_height - pred_height);
  ------------------
  |  |   29|   624k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
  754|   624k|}
decodeframe.c:dec_build_prediction_by_above_pred:
  702|   627k|    int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
  703|   627k|  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
  704|   627k|  const int above_mi_col = xd->mi_col + rel_mi_col;
  705|   627k|  int mi_x, mi_y;
  706|   627k|  MB_MODE_INFO backup_mbmi = *above_mbmi;
  707|       |
  708|   627k|  (void)rel_mi_row;
  709|   627k|  (void)dir;
  710|       |
  711|   627k|  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, op_mi_size,
  712|   627k|                                           &backup_mbmi, ctxt, num_planes);
  713|   627k|  mi_x = above_mi_col << MI_SIZE_LOG2;
  ------------------
  |  |   39|   627k|#define MI_SIZE_LOG2 2
  ------------------
  714|   627k|  mi_y = xd->mi_row << MI_SIZE_LOG2;
  ------------------
  |  |   39|   627k|#define MI_SIZE_LOG2 2
  ------------------
  715|       |
  716|   627k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  717|       |
  718|  2.50M|  for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (718:19): [True: 1.87M, False: 627k]
  ------------------
  719|  1.87M|    const struct macroblockd_plane *pd = &xd->plane[j];
  720|  1.87M|    int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
  ------------------
  |  |   40|  1.87M|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|  1.87M|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  721|  1.87M|    int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
  722|  1.87M|                   block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
  723|       |
  724|  1.87M|    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
  ------------------
  |  Branch (724:9): [True: 856k, False: 1.02M]
  ------------------
  725|  1.02M|    dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
  726|  1.02M|                               &backup_mbmi, 1, bw, bh, mi_x, mi_y);
  727|  1.02M|  }
  728|   627k|}
decodeframe.c:dec_build_prediction_by_left_preds:
  788|   633k|    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
  789|   633k|  MACROBLOCKD *const xd = &dcb->xd;
  790|   633k|  if (!xd->left_available) return;
  ------------------
  |  Branch (790:7): [True: 5.97k, False: 627k]
  ------------------
  791|       |
  792|       |  // Adjust mb_to_right_edge to have the correct value for the OBMC
  793|       |  // prediction block. This is half the width of the original block,
  794|       |  // except for 128-wide blocks, where we only use a width of 32.
  795|   627k|  const int this_width = xd->width * MI_SIZE;
  ------------------
  |  |   40|   627k|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|   627k|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  796|   627k|  const int pred_width = AOMMIN(this_width / 2, 32);
  ------------------
  |  |   34|   627k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 619k, False: 7.26k]
  |  |  ------------------
  ------------------
  797|   627k|  xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width);
  ------------------
  |  |   29|   627k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
  798|       |
  799|   627k|  struct build_prediction_ctxt ctxt = {
  800|   627k|    cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, dcb
  801|   627k|  };
  802|   627k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  803|   627k|  foreach_overlappable_nb_left(cm, xd,
  804|   627k|                               max_neighbor_obmc[mi_size_high_log2[bsize]],
  805|   627k|                               dec_build_prediction_by_left_pred, &ctxt);
  806|       |
  807|   627k|  xd->mb_to_top_edge = -GET_MV_SUBPEL(xd->mi_row * MI_SIZE);
  ------------------
  |  |   29|   627k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
  808|   627k|  xd->mb_to_right_edge -= GET_MV_SUBPEL(this_width - pred_width);
  ------------------
  |  |   29|   627k|#define GET_MV_SUBPEL(x) ((x)*8)
  ------------------
  809|   627k|  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
  810|   627k|}
decodeframe.c:dec_build_prediction_by_left_pred:
  758|   622k|    int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
  759|   622k|  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
  760|   622k|  const int left_mi_row = xd->mi_row + rel_mi_row;
  761|   622k|  int mi_x, mi_y;
  762|   622k|  MB_MODE_INFO backup_mbmi = *left_mbmi;
  763|       |
  764|   622k|  (void)rel_mi_col;
  765|   622k|  (void)dir;
  766|       |
  767|   622k|  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, op_mi_size,
  768|   622k|                                          &backup_mbmi, ctxt, num_planes);
  769|   622k|  mi_x = xd->mi_col << MI_SIZE_LOG2;
  ------------------
  |  |   39|   622k|#define MI_SIZE_LOG2 2
  ------------------
  770|   622k|  mi_y = left_mi_row << MI_SIZE_LOG2;
  ------------------
  |  |   39|   622k|#define MI_SIZE_LOG2 2
  ------------------
  771|   622k|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
  772|       |
  773|  2.48M|  for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (773:19): [True: 1.86M, False: 622k]
  ------------------
  774|  1.86M|    const struct macroblockd_plane *pd = &xd->plane[j];
  775|  1.86M|    int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
  776|  1.86M|                   block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
  777|  1.86M|    int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
  ------------------
  |  |   39|  1.86M|#define MI_SIZE_LOG2 2
  ------------------
  778|       |
  779|  1.86M|    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
  ------------------
  |  Branch (779:9): [True: 0, False: 1.86M]
  ------------------
  780|  1.86M|    dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
  781|  1.86M|                               &backup_mbmi, 1, bw, bh, mi_x, mi_y);
  782|  1.86M|  }
  783|   622k|}
decodeframe.c:cfl_store_inter_block:
  839|  4.24M|                                         MACROBLOCKD *const xd) {
  840|  4.24M|  MB_MODE_INFO *mbmi = xd->mi[0];
  841|  4.24M|  if (store_cfl_required(cm, xd)) {
  ------------------
  |  Branch (841:7): [True: 745k, False: 3.49M]
  ------------------
  842|   745k|    cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
  843|   745k|  }
  844|  4.24M|}
decodeframe.c:get_dec_job_info:
 2908|   189k|static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) {
 2909|   189k|  TileJobsDec *cur_job_info = NULL;
 2910|   189k|#if CONFIG_MULTITHREAD
 2911|   189k|  pthread_mutex_lock(tile_mt_info->job_mutex);
 2912|       |
 2913|   189k|  if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) {
  ------------------
  |  Branch (2913:7): [True: 87.6k, False: 101k]
  ------------------
 2914|  87.6k|    cur_job_info = tile_mt_info->job_queue + tile_mt_info->jobs_dequeued;
 2915|  87.6k|    tile_mt_info->jobs_dequeued++;
 2916|  87.6k|  }
 2917|       |
 2918|   189k|  pthread_mutex_unlock(tile_mt_info->job_mutex);
 2919|       |#else
 2920|       |  (void)tile_mt_info;
 2921|       |#endif
 2922|   189k|  return cur_job_info;
 2923|   189k|}
decodeframe.c:tile_worker_hook_init:
 2929|  87.6k|                                         uint8_t allow_update_cdf) {
 2930|  87.6k|  AV1_COMMON *cm = &pbi->common;
 2931|  87.6k|  ThreadData *const td = thread_data->td;
 2932|  87.6k|  int tile_row = tile_data->tile_info.tile_row;
 2933|  87.6k|  int tile_col = tile_data->tile_info.tile_col;
 2934|       |
 2935|  87.6k|  td->bit_reader = &tile_data->bit_reader;
 2936|  87.6k|  av1_zero(td->cb_buffer_base.dqcoeff);
  ------------------
  |  |   43|  87.6k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 2937|       |
 2938|  87.6k|  MACROBLOCKD *const xd = &td->dcb.xd;
 2939|  87.6k|  av1_tile_init(&xd->tile, cm, tile_row, tile_col);
 2940|  87.6k|  xd->current_base_qindex = cm->quant_params.base_qindex;
 2941|       |
 2942|  87.6k|  setup_bool_decoder(xd, tile_buffer->data, thread_data->data_end,
 2943|  87.6k|                     tile_buffer->size, &thread_data->error_info,
 2944|  87.6k|                     td->bit_reader, allow_update_cdf);
 2945|       |#if CONFIG_ACCOUNTING
 2946|       |  if (pbi->acct_enabled) {
 2947|       |    td->bit_reader->accounting = &pbi->accounting;
 2948|       |    td->bit_reader->accounting->last_tell_frac =
 2949|       |        aom_reader_tell_frac(td->bit_reader);
 2950|       |  } else {
 2951|       |    td->bit_reader->accounting = NULL;
 2952|       |  }
 2953|       |#endif
 2954|  87.6k|  av1_init_macroblockd(cm, xd);
 2955|  87.6k|  xd->error_info = &thread_data->error_info;
 2956|  87.6k|  av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, xd);
 2957|       |
 2958|       |  // Initialise the tile context from the frame context
 2959|  87.6k|  tile_data->tctx = *cm->fc;
 2960|  87.6k|  xd->tile_ctx = &tile_data->tctx;
 2961|       |#if CONFIG_ACCOUNTING
 2962|       |  if (pbi->acct_enabled) {
 2963|       |    tile_data->bit_reader.accounting->last_tell_frac =
 2964|       |        aom_reader_tell_frac(&tile_data->bit_reader);
 2965|       |  }
 2966|       |#endif
 2967|  87.6k|}
decodeframe.c:setup_bool_decoder:
 1405|   163k|    aom_reader *r, uint8_t allow_update_cdf) {
 1406|       |  // Validate the calculated partition length. If the buffer
 1407|       |  // described by the partition can't be fully read, then restrict
 1408|       |  // it to the portion that can be (for EC mode) or throw an error.
 1409|   163k|  if (!read_is_valid(data, read_size, data_end)) {
  ------------------
  |  Branch (1409:7): [True: 654, False: 162k]
  ------------------
 1410|       |    // When internal error occurs ensure that xd->mi_row is set appropriately
 1411|       |    // w.r.t. current tile, which is used to signal processing of current row is
 1412|       |    // done in row-mt decoding.
 1413|    654|    xd->mi_row = xd->tile.mi_row_start;
 1414|       |
 1415|    654|    aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
 1416|    654|                       "Truncated packet or corrupt tile length");
 1417|    654|  }
 1418|   163k|  if (aom_reader_init(r, data, read_size)) {
  ------------------
  |  Branch (1418:7): [True: 0, False: 163k]
  ------------------
 1419|       |    // When internal error occurs ensure that xd->mi_row is set appropriately
 1420|       |    // w.r.t. current tile, which is used to signal processing of current row is
 1421|       |    // done in row-mt decoding.
 1422|      0|    xd->mi_row = xd->tile.mi_row_start;
 1423|       |
 1424|      0|    aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
 1425|      0|                       "Failed to allocate bool decoder %d", 1);
 1426|      0|  }
 1427|       |
 1428|   163k|  r->allow_update_cdf = allow_update_cdf;
 1429|   163k|}
decodeframe.c:parse_tile_row_mt:
 3164|  87.2k|                                     TileDataDec *const tile_data) {
 3165|  87.2k|  AV1_COMMON *const cm = &pbi->common;
 3166|  87.2k|  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
 3167|  87.2k|  const int num_planes = av1_num_planes(cm);
 3168|  87.2k|  const TileInfo *const tile_info = &tile_data->tile_info;
 3169|  87.2k|  int tile_row = tile_info->tile_row;
 3170|  87.2k|  DecoderCodingBlock *const dcb = &td->dcb;
 3171|  87.2k|  MACROBLOCKD *const xd = &dcb->xd;
 3172|       |
 3173|  87.2k|  av1_zero_above_context(cm, xd, tile_info->mi_col_start, tile_info->mi_col_end,
 3174|  87.2k|                         tile_row);
 3175|  87.2k|  av1_reset_loop_filter_delta(xd, num_planes);
 3176|  87.2k|  av1_reset_loop_restoration(xd, num_planes);
 3177|       |
 3178|   222k|  for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
  ------------------
  |  Branch (3178:46): [True: 151k, False: 71.2k]
  ------------------
 3179|   151k|       mi_row += cm->seq_params->mib_size) {
 3180|   151k|    av1_zero_left_context(xd);
 3181|       |
 3182|   627k|    for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
  ------------------
  |  Branch (3182:48): [True: 492k, False: 135k]
  ------------------
 3183|   492k|         mi_col += cm->seq_params->mib_size) {
 3184|   492k|      set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col);
 3185|       |
 3186|       |      // Bit-stream parsing of the superblock
 3187|   492k|      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
 3188|   492k|                       cm->seq_params->sb_size, 0x1);
 3189|       |
 3190|   492k|      if (aom_reader_has_overflowed(td->bit_reader)) {
  ------------------
  |  Branch (3190:11): [True: 15.9k, False: 476k]
  ------------------
 3191|  15.9k|        aom_merge_corrupted_flag(&dcb->corrupted, 1);
 3192|  15.9k|        return;
 3193|  15.9k|      }
 3194|   492k|    }
 3195|   135k|    signal_parse_sb_row_done(pbi, tile_data, sb_mi_size);
 3196|   135k|  }
 3197|       |
 3198|  71.2k|  int corrupted =
 3199|  71.2k|      (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
  ------------------
  |  Branch (3199:7): [True: 5.37k, False: 65.9k]
  ------------------
 3200|  71.2k|  aom_merge_corrupted_flag(&dcb->corrupted, corrupted);
 3201|  71.2k|}
decodeframe.c:set_cb_buffer:
 2456|  1.61M|                                 const int num_planes, int mi_row, int mi_col) {
 2457|  1.61M|  AV1_COMMON *const cm = &pbi->common;
 2458|  1.61M|  int mib_size_log2 = cm->seq_params->mib_size_log2;
 2459|  1.61M|  int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
 2460|  1.61M|  int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
 2461|  1.61M|  CB_BUFFER *cb_buffer = cb_buffer_base + offset;
 2462|       |
 2463|  6.19M|  for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (2463:23): [True: 4.57M, False: 1.61M]
  ------------------
 2464|  4.57M|    dcb->dqcoeff_block[plane] = cb_buffer->dqcoeff[plane];
 2465|  4.57M|    dcb->eob_data[plane] = cb_buffer->eob_data[plane];
 2466|  4.57M|    dcb->cb_offset[plane] = 0;
 2467|  4.57M|    dcb->txb_offset[plane] = 0;
 2468|  4.57M|  }
 2469|  1.61M|  MACROBLOCKD *const xd = &dcb->xd;
 2470|  1.61M|  xd->plane[0].color_index_map = cb_buffer->color_index_map[0];
 2471|  1.61M|  xd->plane[1].color_index_map = cb_buffer->color_index_map[1];
 2472|  1.61M|  xd->color_index_map_offset[0] = 0;
 2473|  1.61M|  xd->color_index_map_offset[1] = 0;
 2474|  1.61M|}
decodeframe.c:decode_partition:
 1256|  17.3M|                                    BLOCK_SIZE bsize, int parse_decode_flag) {
 1257|  17.3M|  assert(bsize < BLOCK_SIZES_ALL);
 1258|  17.3M|  AV1_COMMON *const cm = &pbi->common;
 1259|  17.3M|  DecoderCodingBlock *const dcb = &td->dcb;
 1260|  17.3M|  MACROBLOCKD *const xd = &dcb->xd;
 1261|  17.3M|  const int bw = mi_size_wide[bsize];
 1262|  17.3M|  const int hbs = bw >> 1;
 1263|  17.3M|  PARTITION_TYPE partition;
 1264|  17.3M|  BLOCK_SIZE subsize;
 1265|  17.3M|  const int quarter_step = bw / 4;
 1266|  17.3M|  BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
 1267|  17.3M|  const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;
 1268|  17.3M|  const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;
 1269|       |
 1270|  17.3M|  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
  ------------------
  |  Branch (1270:7): [True: 429k, False: 16.9M]
  |  Branch (1270:42): [True: 334k, False: 16.5M]
  ------------------
 1271|   750k|    return;
 1272|       |
 1273|       |  // parse_decode_flag takes the following values :
 1274|       |  // 01 - do parse only
 1275|       |  // 10 - do decode only
 1276|       |  // 11 - do parse and decode
 1277|  16.5M|  static const block_visitor_fn_t block_visit[4] = { NULL, parse_decode_block,
 1278|  16.5M|                                                     decode_block,
 1279|  16.5M|                                                     parse_decode_block };
 1280|       |
 1281|  16.5M|  if (parse_decode_flag & 1) {
  ------------------
  |  Branch (1281:7): [True: 11.3M, False: 5.25M]
  ------------------
 1282|  11.3M|    const int num_planes = av1_num_planes(cm);
 1283|  44.3M|    for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (1283:25): [True: 32.9M, False: 11.3M]
  ------------------
 1284|  32.9M|      int rcol0, rcol1, rrow0, rrow1;
 1285|       |
 1286|       |      // Skip some unnecessary work if loop restoration is disabled
 1287|  32.9M|      if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
  ------------------
  |  Branch (1287:11): [True: 15.6M, False: 17.3M]
  ------------------
 1288|       |
 1289|  17.3M|      if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
  ------------------
  |  Branch (1289:11): [True: 252k, False: 17.0M]
  ------------------
 1290|  17.3M|                                             &rcol0, &rcol1, &rrow0, &rrow1)) {
 1291|   252k|        const int rstride = cm->rst_info[plane].horz_units;
 1292|   504k|        for (int rrow = rrow0; rrow < rrow1; ++rrow) {
  ------------------
  |  Branch (1292:32): [True: 252k, False: 252k]
  ------------------
 1293|   507k|          for (int rcol = rcol0; rcol < rcol1; ++rcol) {
  ------------------
  |  Branch (1293:34): [True: 255k, False: 252k]
  ------------------
 1294|   255k|            const int runit_idx = rcol + rrow * rstride;
 1295|   255k|            loop_restoration_read_sb_coeffs(cm, xd, reader, plane, runit_idx);
 1296|   255k|          }
 1297|   252k|        }
 1298|   252k|      }
 1299|  17.3M|    }
 1300|       |
 1301|  11.3M|    partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
  ------------------
  |  Branch (1301:17): [True: 751k, False: 10.5M]
  ------------------
 1302|  11.3M|                                    : read_partition(xd, mi_row, mi_col, reader,
 1303|  10.5M|                                                     has_rows, has_cols, bsize);
 1304|  11.3M|  } else {
 1305|  5.25M|    partition = get_partition(cm, mi_row, mi_col, bsize);
 1306|  5.25M|  }
 1307|  16.5M|  subsize = get_partition_subsize(bsize, partition);
 1308|  16.5M|  if (subsize == BLOCK_INVALID) {
  ------------------
  |  Branch (1308:7): [True: 0, False: 16.5M]
  ------------------
 1309|       |    // When an internal error occurs ensure that xd->mi_row is set appropriately
 1310|       |    // w.r.t. current tile, which is used to signal processing of current row is
 1311|       |    // done.
 1312|      0|    xd->mi_row = mi_row;
 1313|      0|    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
 1314|      0|                       "Partition is invalid for block size %dx%d",
 1315|      0|                       block_size_wide[bsize], block_size_high[bsize]);
 1316|      0|  }
 1317|       |  // Check the bitstream is conformant: if there is subsampling on the
 1318|       |  // chroma planes, subsize must subsample to a valid block size.
 1319|  16.5M|  const struct macroblockd_plane *const pd_u = &xd->plane[1];
 1320|  16.5M|  if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) ==
  ------------------
  |  Branch (1320:7): [True: 1.02k, False: 16.5M]
  ------------------
 1321|  16.5M|      BLOCK_INVALID) {
 1322|       |    // When an internal error occurs ensure that xd->mi_row is set appropriately
 1323|       |    // w.r.t. current tile, which is used to signal processing of current row is
 1324|       |    // done.
 1325|  1.02k|    xd->mi_row = mi_row;
 1326|  1.02k|    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
 1327|  1.02k|                       "Block size %dx%d invalid with this subsampling mode",
 1328|  1.02k|                       block_size_wide[subsize], block_size_high[subsize]);
 1329|  1.02k|  }
 1330|       |
 1331|  16.5M|#define DEC_BLOCK_STX_ARG
 1332|  16.5M|#define DEC_BLOCK_EPT_ARG partition,
 1333|  16.5M|#define DEC_BLOCK(db_r, db_c, db_subsize)                                  \
 1334|  16.5M|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
 1335|  16.5M|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
 1336|  16.5M|#define DEC_PARTITION(db_r, db_c, db_subsize)                        \
 1337|  16.5M|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
 1338|  16.5M|                   (db_subsize), parse_decode_flag)
 1339|       |
 1340|  16.5M|  switch (partition) {
 1341|  6.73M|    case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
  ------------------
  |  | 1334|  6.73M|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  6.73M|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  6.73M|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
  |  Branch (1341:5): [True: 6.73M, False: 9.85M]
  ------------------
 1342|  1.83M|    case PARTITION_HORZ:
  ------------------
  |  Branch (1342:5): [True: 1.83M, False: 14.7M]
  ------------------
 1343|  1.83M|      DEC_BLOCK(mi_row, mi_col, subsize);
  ------------------
  |  | 1334|  1.83M|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  1.83M|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  1.83M|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1344|  1.83M|      if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize);
  ------------------
  |  | 1334|  1.73M|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  1.73M|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  1.73M|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
  |  Branch (1344:11): [True: 1.73M, False: 97.7k]
  ------------------
 1345|  1.83M|      break;
 1346|  1.23M|    case PARTITION_VERT:
  ------------------
  |  Branch (1346:5): [True: 1.23M, False: 15.3M]
  ------------------
 1347|  1.23M|      DEC_BLOCK(mi_row, mi_col, subsize);
  ------------------
  |  | 1334|  1.23M|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  1.23M|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  1.23M|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1348|  1.23M|      if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize);
  ------------------
  |  | 1334|  1.07M|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  1.07M|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  1.07M|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
  |  Branch (1348:11): [True: 1.07M, False: 166k]
  ------------------
 1349|  1.23M|      break;
 1350|  3.94M|    case PARTITION_SPLIT:
  ------------------
  |  Branch (1350:5): [True: 3.94M, False: 12.6M]
  ------------------
 1351|  3.94M|      DEC_PARTITION(mi_row, mi_col, subsize);
  ------------------
  |  | 1337|  3.94M|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
  |  | 1338|  3.94M|                   (db_subsize), parse_decode_flag)
  ------------------
 1352|  3.94M|      DEC_PARTITION(mi_row, mi_col + hbs, subsize);
  ------------------
  |  | 1337|  3.94M|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
  |  | 1338|  3.94M|                   (db_subsize), parse_decode_flag)
  ------------------
 1353|  3.94M|      DEC_PARTITION(mi_row + hbs, mi_col, subsize);
  ------------------
  |  | 1337|  3.94M|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
  |  | 1338|  3.94M|                   (db_subsize), parse_decode_flag)
  ------------------
 1354|  3.94M|      DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize);
  ------------------
  |  | 1337|  3.94M|  decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
  |  | 1338|  3.94M|                   (db_subsize), parse_decode_flag)
  ------------------
 1355|  3.94M|      break;
 1356|   363k|    case PARTITION_HORZ_A:
  ------------------
  |  Branch (1356:5): [True: 363k, False: 16.2M]
  ------------------
 1357|   363k|      DEC_BLOCK(mi_row, mi_col, bsize2);
  ------------------
  |  | 1334|   363k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   363k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   363k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1358|   363k|      DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
  ------------------
  |  | 1334|   363k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   363k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   363k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1359|   363k|      DEC_BLOCK(mi_row + hbs, mi_col, subsize);
  ------------------
  |  | 1334|   363k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   363k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   363k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1360|   363k|      break;
 1361|   325k|    case PARTITION_HORZ_B:
  ------------------
  |  Branch (1361:5): [True: 325k, False: 16.2M]
  ------------------
 1362|   325k|      DEC_BLOCK(mi_row, mi_col, subsize);
  ------------------
  |  | 1334|   325k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   325k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   325k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1363|   325k|      DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
  ------------------
  |  | 1334|   325k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   325k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   325k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1364|   325k|      DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
  ------------------
  |  | 1334|   325k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   325k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   325k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1365|   325k|      break;
 1366|   221k|    case PARTITION_VERT_A:
  ------------------
  |  Branch (1366:5): [True: 221k, False: 16.3M]
  ------------------
 1367|   221k|      DEC_BLOCK(mi_row, mi_col, bsize2);
  ------------------
  |  | 1334|   221k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   221k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   221k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1368|   221k|      DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
  ------------------
  |  | 1334|   221k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   221k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   221k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1369|   221k|      DEC_BLOCK(mi_row, mi_col + hbs, subsize);
  ------------------
  |  | 1334|   221k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   221k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   221k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1370|   221k|      break;
 1371|   254k|    case PARTITION_VERT_B:
  ------------------
  |  Branch (1371:5): [True: 254k, False: 16.3M]
  ------------------
 1372|   254k|      DEC_BLOCK(mi_row, mi_col, subsize);
  ------------------
  |  | 1334|   254k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   254k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   254k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1373|   254k|      DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
  ------------------
  |  | 1334|   254k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   254k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   254k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1374|   254k|      DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
  ------------------
  |  | 1334|   254k|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|   254k|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|   254k|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1375|   254k|      break;
 1376|  1.08M|    case PARTITION_HORZ_4:
  ------------------
  |  Branch (1376:5): [True: 1.08M, False: 15.5M]
  ------------------
 1377|  5.43M|      for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (1377:23): [True: 4.35M, False: 1.08M]
  ------------------
 1378|  4.35M|        int this_mi_row = mi_row + i * quarter_step;
 1379|  4.35M|        if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break;
  ------------------
  |  Branch (1379:13): [True: 3.26M, False: 1.08M]
  |  Branch (1379:22): [True: 8.04k, False: 3.25M]
  ------------------
 1380|  4.34M|        DEC_BLOCK(this_mi_row, mi_col, subsize);
  ------------------
  |  | 1334|  4.34M|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  4.34M|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  4.34M|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1381|  4.34M|      }
 1382|  1.08M|      break;
 1383|   601k|    case PARTITION_VERT_4:
  ------------------
  |  Branch (1383:5): [True: 601k, False: 15.9M]
  ------------------
 1384|  2.99M|      for (int i = 0; i < 4; ++i) {
  ------------------
  |  Branch (1384:23): [True: 2.40M, False: 593k]
  ------------------
 1385|  2.40M|        int this_mi_col = mi_col + i * quarter_step;
 1386|  2.40M|        if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break;
  ------------------
  |  Branch (1386:13): [True: 1.80M, False: 601k]
  |  Branch (1386:22): [True: 8.08k, False: 1.79M]
  ------------------
 1387|  2.39M|        DEC_BLOCK(mi_row, this_mi_col, subsize);
  ------------------
  |  | 1334|  2.39M|  block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
  |  | 1335|  2.39M|                                 reader, DEC_BLOCK_EPT_ARG(db_subsize))
  |  |  ------------------
  |  |  |  | 1332|  2.39M|#define DEC_BLOCK_EPT_ARG partition,
  |  |  ------------------
  ------------------
 1388|  2.39M|      }
 1389|   601k|      break;
 1390|      0|    default: assert(0 && "Invalid partition type");
  ------------------
  |  Branch (1390:5): [True: 0, False: 16.5M]
  ------------------
 1391|  16.5M|  }
 1392|       |
 1393|  16.5M|#undef DEC_PARTITION
 1394|  16.5M|#undef DEC_BLOCK
 1395|  16.5M|#undef DEC_BLOCK_EPT_ARG
 1396|  16.5M|#undef DEC_BLOCK_STX_ARG
 1397|       |
 1398|  16.5M|  if (parse_decode_flag & 1)
  ------------------
  |  Branch (1398:7): [True: 11.3M, False: 5.25M]
  ------------------
 1399|  11.3M|    update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 1400|  16.5M|}
decodeframe.c:parse_decode_block:
 1130|  15.6M|                                      BLOCK_SIZE bsize) {
 1131|  15.6M|  DecoderCodingBlock *const dcb = &td->dcb;
 1132|  15.6M|  MACROBLOCKD *const xd = &dcb->xd;
 1133|  15.6M|  decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize);
 1134|       |
 1135|  15.6M|  av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens);
 1136|       |
 1137|  15.6M|  AV1_COMMON *cm = &pbi->common;
 1138|  15.6M|  const int num_planes = av1_num_planes(cm);
 1139|  15.6M|  MB_MODE_INFO *mbmi = xd->mi[0];
 1140|  15.6M|  int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
  ------------------
  |  Branch (1140:24): [True: 4.61M, False: 10.9M]
  |  Branch (1140:48): [True: 529, False: 10.9M]
  ------------------
 1141|  15.6M|  if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
  ------------------
  |  Branch (1141:7): [True: 3.13M, False: 12.4M]
  |  Branch (1141:49): [True: 2.98M, False: 142k]
  ------------------
 1142|  15.6M|      !mbmi->skip_txfm && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
  ------------------
  |  Branch (1142:7): [True: 2.48M, False: 507k]
  |  Branch (1142:27): [True: 953k, False: 1.52M]
  |  Branch (1142:45): [True: 952k, False: 572]
  ------------------
 1143|   952k|    const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
 1144|   952k|    const int bh = tx_size_high_unit[max_tx_size];
 1145|   952k|    const int bw = tx_size_wide_unit[max_tx_size];
 1146|   952k|    const int width = mi_size_wide[bsize];
 1147|   952k|    const int height = mi_size_high[bsize];
 1148|       |
 1149|  1.91M|    for (int idy = 0; idy < height; idy += bh)
  ------------------
  |  Branch (1149:23): [True: 963k, False: 952k]
  ------------------
 1150|  1.94M|      for (int idx = 0; idx < width; idx += bw)
  ------------------
  |  Branch (1150:25): [True: 983k, False: 963k]
  ------------------
 1151|   983k|        read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);
 1152|  14.6M|  } else {
 1153|  14.6M|    mbmi->tx_size = read_tx_size(xd, cm->features.tx_mode, inter_block_tx,
 1154|  14.6M|                                 !mbmi->skip_txfm, r);
 1155|  14.6M|    if (inter_block_tx)
  ------------------
  |  Branch (1155:9): [True: 3.65M, False: 11.0M]
  ------------------
 1156|  3.65M|      memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
 1157|  14.6M|    set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
 1158|  14.6M|                  mbmi->skip_txfm && is_inter_block(mbmi), xd);
  ------------------
  |  Branch (1158:19): [True: 6.08M, False: 8.57M]
  |  Branch (1158:38): [True: 1.20M, False: 4.87M]
  ------------------
 1159|  14.6M|  }
 1160|       |
 1161|  15.6M|  if (cm->delta_q_info.delta_q_present_flag) {
  ------------------
  |  Branch (1161:7): [True: 2.16M, False: 13.4M]
  ------------------
 1162|  19.4M|    for (int i = 0; i < MAX_SEGMENTS; i++) {
  ------------------
  |  |   21|  19.4M|#define MAX_SEGMENTS 8
  ------------------
  |  Branch (1162:21): [True: 17.3M, False: 2.16M]
  ------------------
 1163|  17.3M|      const int current_qindex =
 1164|  17.3M|          av1_get_qindex(&cm->seg, i, xd->current_base_qindex);
 1165|  17.3M|      const CommonQuantParams *const quant_params = &cm->quant_params;
 1166|  63.4M|      for (int j = 0; j < num_planes; ++j) {
  ------------------
  |  Branch (1166:23): [True: 46.0M, False: 17.3M]
  ------------------
 1167|  46.0M|        const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q
  ------------------
  |  Branch (1167:32): [True: 17.3M, False: 28.7M]
  ------------------
 1168|  46.0M|                                      : (j == 1 ? quant_params->u_dc_delta_q
  ------------------
  |  Branch (1168:42): [True: 14.4M, False: 14.3M]
  ------------------
 1169|  28.7M|                                                : quant_params->v_dc_delta_q);
 1170|  46.0M|        const int ac_delta_q = j == 0 ? 0
  ------------------
  |  Branch (1170:32): [True: 17.3M, False: 28.7M]
  ------------------
 1171|  46.0M|                                      : (j == 1 ? quant_params->u_ac_delta_q
  ------------------
  |  Branch (1171:42): [True: 14.4M, False: 14.3M]
  ------------------
 1172|  28.7M|                                                : quant_params->v_ac_delta_q);
 1173|  46.0M|        xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(
 1174|  46.0M|            current_qindex, dc_delta_q, cm->seq_params->bit_depth);
 1175|  46.0M|        xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(
 1176|  46.0M|            current_qindex, ac_delta_q, cm->seq_params->bit_depth);
 1177|  46.0M|      }
 1178|  17.3M|    }
 1179|  2.16M|  }
 1180|  15.6M|  if (mbmi->skip_txfm) av1_reset_entropy_context(xd, bsize, num_planes);
  ------------------
  |  Branch (1180:7): [True: 6.08M, False: 9.52M]
  ------------------
 1181|       |
 1182|  15.6M|  decode_token_recon_block(pbi, td, r, bsize);
 1183|  15.6M|}
decodeframe.c:decode_mbmi_block:
  375|  15.6M|                                     BLOCK_SIZE bsize) {
  376|  15.6M|  AV1_COMMON *const cm = &pbi->common;
  377|  15.6M|  const SequenceHeader *const seq_params = cm->seq_params;
  378|  15.6M|  const int bw = mi_size_wide[bsize];
  379|  15.6M|  const int bh = mi_size_high[bsize];
  380|  15.6M|  const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col);
  ------------------
  |  |   34|  15.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 15.0M, False: 522k]
  |  |  ------------------
  ------------------
  381|  15.6M|  const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row);
  ------------------
  |  |   34|  15.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 15.2M, False: 353k]
  |  |  ------------------
  ------------------
  382|  15.6M|  MACROBLOCKD *const xd = &dcb->xd;
  383|       |
  384|       |#if CONFIG_ACCOUNTING
  385|       |  aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
  386|       |#endif
  387|  15.6M|  set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
  388|  15.6M|  xd->mi[0]->partition = partition;
  389|  15.6M|  av1_read_mode_info(pbi, dcb, r, x_mis, y_mis);
  390|  15.6M|  if (bsize >= BLOCK_8X8 &&
  ------------------
  |  Branch (390:7): [True: 13.2M, False: 2.31M]
  ------------------
  391|  15.6M|      (seq_params->subsampling_x || seq_params->subsampling_y)) {
  ------------------
  |  Branch (391:8): [True: 8.82M, False: 4.47M]
  |  Branch (391:37): [True: 18.4E, False: 4.47M]
  ------------------
  392|  8.82M|    const BLOCK_SIZE uv_subsize =
  393|  8.82M|        av1_ss_size_lookup[bsize][seq_params->subsampling_x]
  394|  8.82M|                          [seq_params->subsampling_y];
  395|  8.82M|    if (uv_subsize == BLOCK_INVALID)
  ------------------
  |  Branch (395:9): [True: 0, False: 8.82M]
  ------------------
  396|      0|      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
  397|      0|                         "Invalid block size.");
  398|  8.82M|  }
  399|  15.6M|}
decodeframe.c:set_offsets:
  339|  15.6M|                               int bh, int x_mis, int y_mis) {
  340|  15.6M|  const int num_planes = av1_num_planes(cm);
  341|  15.6M|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  342|  15.6M|  const TileInfo *const tile = &xd->tile;
  343|       |
  344|  15.6M|  set_mi_offsets(mi_params, xd, mi_row, mi_col);
  345|  15.6M|  xd->mi[0]->bsize = bsize;
  346|       |#if CONFIG_RD_DEBUG
  347|       |  xd->mi[0]->mi_row = mi_row;
  348|       |  xd->mi[0]->mi_col = mi_col;
  349|       |#endif
  350|       |
  351|  15.6M|  assert(x_mis && y_mis);
  352|  78.0M|  for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0];
  ------------------
  |  Branch (352:19): [True: 62.4M, False: 15.6M]
  ------------------
  353|  15.6M|  int idx = mi_params->mi_stride;
  354|  73.1M|  for (int y = 1; y < y_mis; ++y) {
  ------------------
  |  Branch (354:19): [True: 57.4M, False: 15.6M]
  ------------------
  355|  57.4M|    memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0]));
  356|  57.4M|    idx += mi_params->mi_stride;
  357|  57.4M|  }
  358|       |
  359|  15.6M|  set_plane_n4(xd, bw, bh, num_planes);
  360|  15.6M|  set_entropy_context(xd, mi_row, mi_col, num_planes);
  361|       |
  362|       |  // Distance of Mb to the various image edges. These are specified to 8th pel
  363|       |  // as they are always compared to values that are in 1/8th pel units
  364|  15.6M|  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
  365|  15.6M|                 mi_params->mi_cols);
  366|       |
  367|  15.6M|  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
  368|  15.6M|                       num_planes);
  369|  15.6M|}
decodeframe.c:read_tx_size_vartx:
 1030|  1.68M|                                      int blk_col, aom_reader *r) {
 1031|  1.68M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1032|  1.68M|  int is_split = 0;
 1033|  1.68M|  const BLOCK_SIZE bsize = mbmi->bsize;
 1034|  1.68M|  const int max_blocks_high = max_block_high(xd, bsize, 0);
 1035|  1.68M|  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
 1036|  1.68M|  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
  ------------------
  |  Branch (1036:7): [True: 375, False: 1.68M]
  |  Branch (1036:37): [True: 1.59k, False: 1.68M]
  ------------------
 1037|  1.68M|  assert(tx_size > TX_4X4);
 1038|  1.68M|  TX_SIZE txs = max_txsize_rect_lookup[bsize];
 1039|  3.36M|  for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
  ------------------
  |  |   56|  3.36M|#define MAX_VARTX_DEPTH 2
  ------------------
  |  Branch (1039:23): [True: 1.68M, False: 1.68M]
  ------------------
 1040|  1.68M|    txs = sub_tx_size_map[txs];
 1041|  1.68M|  const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
  ------------------
  |  |   39|  1.68M|#define MI_SIZE_LOG2 2
  ------------------
 1042|  1.68M|  const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
  ------------------
  |  |   39|  1.68M|#define MI_SIZE_LOG2 2
  ------------------
 1043|  1.68M|  const int bw_log2 = mi_size_wide_log2[bsize];
 1044|  1.68M|  const int stride_log2 = bw_log2 - tx_w_log2;
 1045|       |
 1046|  1.68M|  if (depth == MAX_VARTX_DEPTH) {
  ------------------
  |  |   56|  1.68M|#define MAX_VARTX_DEPTH 2
  ------------------
  |  Branch (1046:7): [True: 187k, False: 1.49M]
  ------------------
 1047|   187k|    set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size,
 1048|   187k|                      tx_size, blk_row, blk_col);
 1049|   187k|    mbmi->tx_size = tx_size;
 1050|   187k|    txfm_partition_update(xd->above_txfm_context + blk_col,
 1051|   187k|                          xd->left_txfm_context + blk_row, tx_size, tx_size);
 1052|   187k|    return;
 1053|   187k|  }
 1054|       |
 1055|  1.49M|  const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
 1056|  1.49M|                                         xd->left_txfm_context + blk_row,
 1057|  1.49M|                                         mbmi->bsize, tx_size);
 1058|  1.49M|  is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR);
  ------------------
  |  |   51|  1.49M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1059|       |
 1060|  1.49M|  if (is_split) {
  ------------------
  |  Branch (1060:7): [True: 383k, False: 1.11M]
  ------------------
 1061|   383k|    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
 1062|   383k|    const int bsw = tx_size_wide_unit[sub_txs];
 1063|   383k|    const int bsh = tx_size_high_unit[sub_txs];
 1064|       |
 1065|   383k|    if (sub_txs == TX_4X4) {
  ------------------
  |  Branch (1065:9): [True: 128k, False: 254k]
  ------------------
 1066|   128k|      set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size,
 1067|   128k|                        sub_txs, blk_row, blk_col);
 1068|   128k|      mbmi->tx_size = sub_txs;
 1069|   128k|      txfm_partition_update(xd->above_txfm_context + blk_col,
 1070|   128k|                            xd->left_txfm_context + blk_row, sub_txs, tx_size);
 1071|   128k|      return;
 1072|   128k|    }
 1073|       |
 1074|   254k|    assert(bsw > 0 && bsh > 0);
 1075|   679k|    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
  ------------------
  |  Branch (1075:23): [True: 424k, False: 254k]
  ------------------
 1076|  1.12M|      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
  ------------------
  |  Branch (1076:25): [True: 702k, False: 424k]
  ------------------
 1077|   702k|        int offsetr = blk_row + row;
 1078|   702k|        int offsetc = blk_col + col;
 1079|   702k|        read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r);
 1080|   702k|      }
 1081|   424k|    }
 1082|  1.11M|  } else {
 1083|  1.11M|    set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size,
 1084|  1.11M|                      tx_size, blk_row, blk_col);
 1085|  1.11M|    mbmi->tx_size = tx_size;
 1086|  1.11M|    txfm_partition_update(xd->above_txfm_context + blk_col,
 1087|  1.11M|                          xd->left_txfm_context + blk_row, tx_size, tx_size);
 1088|  1.11M|  }
 1089|  1.49M|}
decodeframe.c:set_inter_tx_size:
 1016|  1.42M|                                     int blk_col) {
 1017|  3.37M|  for (int idy = 0; idy < tx_size_high_unit[split_size];
  ------------------
  |  Branch (1017:21): [True: 1.94M, False: 1.42M]
  ------------------
 1018|  1.94M|       idy += tx_size_high_unit[min_txs]) {
 1019|  4.85M|    for (int idx = 0; idx < tx_size_wide_unit[split_size];
  ------------------
  |  Branch (1019:23): [True: 2.90M, False: 1.94M]
  ------------------
 1020|  2.90M|         idx += tx_size_wide_unit[min_txs]) {
 1021|  2.90M|      const int index = (((blk_row + idy) >> tx_h_log2) << stride_log2) +
 1022|  2.90M|                        ((blk_col + idx) >> tx_w_log2);
 1023|  2.90M|      mbmi->inter_tx_size[index] = txs;
 1024|  2.90M|    }
 1025|  1.94M|  }
 1026|  1.42M|}
decodeframe.c:read_tx_size:
 1109|  14.6M|                            aom_reader *r) {
 1110|  14.6M|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 1111|  14.6M|  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
  ------------------
  |  Branch (1111:7): [True: 122k, False: 14.5M]
  ------------------
 1112|       |
 1113|  14.5M|  if (block_signals_txsize(bsize)) {
  ------------------
  |  Branch (1113:7): [True: 13.7M, False: 747k]
  ------------------
 1114|  13.7M|    if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) {
  ------------------
  |  Branch (1114:10): [True: 10.3M, False: 3.38M]
  |  Branch (1114:23): [True: 2.25M, False: 1.12M]
  |  Branch (1114:46): [True: 1.63M, False: 11.0M]
  ------------------
 1115|  1.63M|      const TX_SIZE coded_tx_size = read_selected_tx_size(xd, r);
 1116|  1.63M|      return coded_tx_size;
 1117|  12.1M|    } else {
 1118|  12.1M|      return tx_size_from_tx_mode(bsize, tx_mode);
 1119|  12.1M|    }
 1120|  13.7M|  } else {
 1121|   747k|    assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4));
 1122|   748k|    return max_txsize_rect_lookup[bsize];
 1123|   747k|  }
 1124|  14.5M|}
decodeframe.c:read_selected_tx_size:
 1092|  1.63M|                                     aom_reader *r) {
 1093|       |  // TODO(debargha): Clean up the logic here. This function should only
 1094|       |  // be called for intra.
 1095|  1.63M|  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
 1096|  1.63M|  const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
 1097|  1.63M|  const int max_depths = bsize_to_max_depth(bsize);
 1098|  1.63M|  const int ctx = get_tx_size_context(xd);
 1099|  1.63M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1100|  1.63M|  const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx],
  ------------------
  |  |   51|  1.63M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1101|  1.63M|                                    max_depths + 1, ACCT_STR);
 1102|  1.63M|  assert(depth >= 0 && depth <= max_depths);
 1103|  1.63M|  const TX_SIZE tx_size = depth_to_tx_size(depth, bsize);
 1104|  1.63M|  return tx_size;
 1105|  1.63M|}
decodeframe.c:decode_token_recon_block:
  904|  22.8M|                                            BLOCK_SIZE bsize) {
  905|  22.8M|  AV1_COMMON *const cm = &pbi->common;
  906|  22.8M|  DecoderCodingBlock *const dcb = &td->dcb;
  907|  22.8M|  MACROBLOCKD *const xd = &dcb->xd;
  908|  22.8M|  const int num_planes = av1_num_planes(cm);
  909|  22.8M|  MB_MODE_INFO *mbmi = xd->mi[0];
  910|       |
  911|  22.8M|  if (!is_inter_block(mbmi)) {
  ------------------
  |  Branch (911:7): [True: 14.8M, False: 7.92M]
  ------------------
  912|  14.8M|    int row, col;
  913|  14.8M|    assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
  914|  14.8M|                                         xd->plane[0].subsampling_y));
  915|  14.8M|    const int max_blocks_wide = max_block_wide(xd, bsize, 0);
  916|  14.8M|    const int max_blocks_high = max_block_high(xd, bsize, 0);
  917|  14.8M|    const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
  918|  14.8M|    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
  919|  14.8M|    int mu_blocks_high = mi_size_high[max_unit_bsize];
  920|  14.8M|    mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
  ------------------
  |  |   34|  14.8M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 13.2M, False: 1.66M]
  |  |  ------------------
  ------------------
  921|  14.8M|    mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
  ------------------
  |  |   34|  14.8M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 13.4M, False: 1.40M]
  |  |  ------------------
  ------------------
  922|       |
  923|  29.9M|    for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
  ------------------
  |  Branch (923:19): [True: 15.0M, False: 14.8M]
  ------------------
  924|  30.7M|      for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
  ------------------
  |  Branch (924:21): [True: 15.6M, False: 15.0M]
  ------------------
  925|  58.8M|        for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (925:29): [True: 44.0M, False: 14.7M]
  ------------------
  926|  44.0M|          if (plane && !xd->is_chroma_ref) break;
  ------------------
  |  Branch (926:15): [True: 28.5M, False: 15.5M]
  |  Branch (926:24): [True: 874k, False: 27.6M]
  ------------------
  927|  43.1M|          const struct macroblockd_plane *const pd = &xd->plane[plane];
  928|  43.1M|          const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
  929|  43.1M|          const int stepr = tx_size_high_unit[tx_size];
  930|  43.1M|          const int stepc = tx_size_wide_unit[tx_size];
  931|       |
  932|  43.1M|          const int unit_height = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|  86.3M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (41:41): [True: 1.21M, False: 41.9M]
  |  |  ------------------
  ------------------
  933|  43.1M|              AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
  934|  43.1M|          const int unit_width = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|  86.3M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (41:41): [True: 1.16M, False: 42.0M]
  |  |  ------------------
  ------------------
  935|  43.1M|              AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
  936|       |
  937|  99.7M|          for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
  ------------------
  |  Branch (937:56): [True: 56.6M, False: 43.1M]
  ------------------
  938|  56.6M|               blk_row += stepr) {
  939|   242M|            for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
  ------------------
  |  Branch (939:58): [True: 185M, False: 56.6M]
  ------------------
  940|   185M|                 blk_col += stepc) {
  941|   185M|              td->read_coeffs_tx_intra_block_visit(cm, dcb, r, plane, blk_row,
  942|   185M|                                                   blk_col, tx_size);
  943|   185M|              td->predict_and_recon_intra_block_visit(
  944|   185M|                  cm, dcb, r, plane, blk_row, blk_col, tx_size);
  945|   185M|              set_cb_buffer_offsets(dcb, tx_size, plane);
  946|   185M|            }
  947|  56.6M|          }
  948|  43.1M|        }
  949|  15.6M|      }
  950|  15.0M|    }
  951|  14.8M|  } else {
  952|  7.92M|    td->predict_inter_block_visit(cm, dcb, bsize);
  953|       |    // Reconstruction
  954|  7.92M|    if (!mbmi->skip_txfm) {
  ------------------
  |  Branch (954:9): [True: 6.07M, False: 1.84M]
  ------------------
  955|  6.07M|      int eobtotal = 0;
  956|       |
  957|  6.07M|      const int max_blocks_wide = max_block_wide(xd, bsize, 0);
  958|  6.07M|      const int max_blocks_high = max_block_high(xd, bsize, 0);
  959|  6.07M|      int row, col;
  960|       |
  961|  6.07M|      const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
  962|  6.07M|      assert(max_unit_bsize ==
  963|  6.07M|             get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
  964|  6.07M|                                  xd->plane[0].subsampling_y));
  965|  6.06M|      int mu_blocks_wide = mi_size_wide[max_unit_bsize];
  966|  6.06M|      int mu_blocks_high = mi_size_high[max_unit_bsize];
  967|       |
  968|  6.06M|      mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
  ------------------
  |  |   34|  6.06M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.88M, False: 180k]
  |  |  ------------------
  ------------------
  969|  6.06M|      mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
  ------------------
  |  |   34|  6.06M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.92M, False: 141k]
  |  |  ------------------
  ------------------
  970|       |
  971|  12.1M|      for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
  ------------------
  |  Branch (971:21): [True: 6.09M, False: 6.06M]
  ------------------
  972|  12.2M|        for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
  ------------------
  |  Branch (972:23): [True: 6.15M, False: 6.09M]
  ------------------
  973|  22.3M|          for (int plane = 0; plane < num_planes; ++plane) {
  ------------------
  |  Branch (973:31): [True: 17.2M, False: 5.05M]
  ------------------
  974|  17.2M|            if (plane && !xd->is_chroma_ref) break;
  ------------------
  |  Branch (974:17): [True: 11.1M, False: 6.12M]
  |  Branch (974:26): [True: 1.09M, False: 10.0M]
  ------------------
  975|  16.1M|            const struct macroblockd_plane *const pd = &xd->plane[plane];
  976|  16.1M|            const int ss_x = pd->subsampling_x;
  977|  16.1M|            const int ss_y = pd->subsampling_y;
  978|  16.1M|            const BLOCK_SIZE plane_bsize =
  979|  16.1M|                get_plane_block_size(bsize, ss_x, ss_y);
  980|  16.1M|            const TX_SIZE max_tx_size =
  981|  16.1M|                get_vartx_max_txsize(xd, plane_bsize, plane);
  982|  16.1M|            const int bh_var_tx = tx_size_high_unit[max_tx_size];
  983|  16.1M|            const int bw_var_tx = tx_size_wide_unit[max_tx_size];
  984|  16.1M|            int block = 0;
  985|  16.1M|            int step =
  986|  16.1M|                tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
  987|  16.1M|            int blk_row, blk_col;
  988|  16.1M|            const int unit_height = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|  32.3M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (41:41): [True: 155k, False: 16.0M]
  |  |  ------------------
  ------------------
  989|  16.1M|                AOMMIN(mu_blocks_high + row, max_blocks_high), ss_y);
  990|  16.1M|            const int unit_width = ROUND_POWER_OF_TWO(
  ------------------
  |  |   41|  32.3M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  |  |  ------------------
  |  |  |  Branch (41:41): [True: 163k, False: 15.9M]
  |  |  ------------------
  ------------------
  991|  16.1M|                AOMMIN(mu_blocks_wide + col, max_blocks_wide), ss_x);
  992|       |
  993|  32.3M|            for (blk_row = row >> ss_y; blk_row < unit_height;
  ------------------
  |  Branch (993:41): [True: 16.2M, False: 16.1M]
  ------------------
  994|  16.2M|                 blk_row += bh_var_tx) {
  995|  33.3M|              for (blk_col = col >> ss_x; blk_col < unit_width;
  ------------------
  |  Branch (995:43): [True: 17.0M, False: 16.2M]
  ------------------
  996|  17.0M|                   blk_col += bw_var_tx) {
  997|  17.0M|                decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize,
  998|  17.0M|                                      blk_row, blk_col, block, max_tx_size,
  999|  17.0M|                                      &eobtotal);
 1000|  17.0M|                block += step;
 1001|  17.0M|              }
 1002|  16.2M|            }
 1003|  16.1M|          }
 1004|  6.15M|        }
 1005|  6.09M|      }
 1006|  6.06M|    }
 1007|  7.90M|    td->cfl_store_inter_block_visit(cm, xd);
 1008|  7.90M|  }
 1009|       |
 1010|  22.8M|  av1_visit_palette(pbi, xd, r, set_color_index_map_offset);
 1011|  22.8M|}
decodeframe.c:set_cb_buffer_offsets:
  275|   204M|                                         TX_SIZE tx_size, int plane) {
  276|   204M|  dcb->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
  277|   204M|  dcb->txb_offset[plane] =
  278|   204M|      dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
  ------------------
  |  |  231|   204M|#define TX_SIZE_W_MIN 4
  ------------------
                    dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
  ------------------
  |  |  238|   204M|#define TX_SIZE_H_MIN 4
  ------------------
  279|   204M|}
decodeframe.c:decode_reconstruct_tx:
  286|  18.8M|                                         TX_SIZE tx_size, int *eob_total) {
  287|  18.8M|  DecoderCodingBlock *const dcb = &td->dcb;
  288|  18.8M|  MACROBLOCKD *const xd = &dcb->xd;
  289|  18.8M|  const struct macroblockd_plane *const pd = &xd->plane[plane];
  290|  18.8M|  const TX_SIZE plane_tx_size =
  291|  18.8M|      plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
  ------------------
  |  Branch (291:7): [True: 10.6M, False: 8.22M]
  ------------------
  292|  10.6M|                                    pd->subsampling_y)
  293|  18.8M|            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
  294|  8.22M|                                                         blk_col)];
  295|       |  // Scale to match transform block unit.
  296|  18.8M|  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
  297|  18.8M|  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
  298|       |
  299|  18.9M|  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
  ------------------
  |  Branch (299:7): [True: 18.4E, False: 18.9M]
  |  Branch (299:37): [True: 18.4E, False: 18.9M]
  ------------------
  300|       |
  301|  18.8M|  if (tx_size == plane_tx_size || plane) {
  ------------------
  |  Branch (301:7): [True: 17.6M, False: 1.24M]
  |  Branch (301:35): [True: 629k, False: 612k]
  ------------------
  302|  18.3M|    td->read_coeffs_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
  303|  18.3M|                                         tx_size);
  304|       |
  305|  18.3M|    td->inverse_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
  306|  18.3M|                                     tx_size);
  307|  18.3M|    eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
  308|  18.3M|    *eob_total += eob_data->eob;
  309|  18.3M|    set_cb_buffer_offsets(dcb, tx_size, plane);
  310|  18.3M|  } else {
  311|   576k|    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
  312|   576k|    assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
  313|   612k|    assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
  314|   612k|    const int bsw = tx_size_wide_unit[sub_txs];
  315|   612k|    const int bsh = tx_size_high_unit[sub_txs];
  316|   612k|    const int sub_step = bsw * bsh;
  317|   612k|    const int row_end =
  318|   612k|        AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
  ------------------
  |  |   34|   612k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 59.0k, False: 553k]
  |  |  ------------------
  ------------------
  319|   612k|    const int col_end =
  320|   612k|        AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
  ------------------
  |  |   34|   612k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 63.0k, False: 549k]
  |  |  ------------------
  ------------------
  321|       |
  322|   612k|    assert(bsw > 0 && bsh > 0);
  323|       |
  324|  1.66M|    for (int row = 0; row < row_end; row += bsh) {
  ------------------
  |  Branch (324:23): [True: 1.04M, False: 612k]
  ------------------
  325|  1.04M|      const int offsetr = blk_row + row;
  326|  2.86M|      for (int col = 0; col < col_end; col += bsw) {
  ------------------
  |  Branch (326:25): [True: 1.81M, False: 1.04M]
  ------------------
  327|  1.81M|        const int offsetc = blk_col + col;
  328|       |
  329|  1.81M|        decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr,
  330|  1.81M|                              offsetc, block, sub_txs, eob_total);
  331|  1.81M|        block += sub_step;
  332|  1.81M|      }
  333|  1.04M|    }
  334|   612k|  }
  335|  18.8M|}
decodeframe.c:set_color_index_map_offset:
  893|   162k|                                              aom_reader *r) {
  894|   162k|  (void)r;
  895|   162k|  Av1ColorMapParam params;
  896|   162k|  const MB_MODE_INFO *const mbmi = xd->mi[0];
  897|   162k|  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
  898|   162k|                           &params.plane_height, NULL, NULL);
  899|   162k|  xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
  900|   162k|}
decodeframe.c:decode_block:
 1218|  7.24M|                                PARTITION_TYPE partition, BLOCK_SIZE bsize) {
 1219|  7.24M|  (void)partition;
 1220|  7.24M|  set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
 1221|  7.24M|  decode_token_recon_block(pbi, td, r, bsize);
 1222|  7.24M|}
decodeframe.c:set_offsets_for_pred_and_recon:
 1188|  7.24M|                                                  BLOCK_SIZE bsize) {
 1189|  7.24M|  AV1_COMMON *const cm = &pbi->common;
 1190|  7.24M|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
 1191|  7.24M|  DecoderCodingBlock *const dcb = &td->dcb;
 1192|  7.24M|  MACROBLOCKD *const xd = &dcb->xd;
 1193|  7.24M|  const int bw = mi_size_wide[bsize];
 1194|  7.24M|  const int bh = mi_size_high[bsize];
 1195|  7.24M|  const int num_planes = av1_num_planes(cm);
 1196|       |
 1197|  7.24M|  const int offset = mi_row * mi_params->mi_stride + mi_col;
 1198|  7.24M|  const TileInfo *const tile = &xd->tile;
 1199|       |
 1200|  7.24M|  xd->mi = mi_params->mi_grid_base + offset;
 1201|  7.24M|  xd->tx_type_map =
 1202|  7.24M|      &mi_params->tx_type_map[mi_row * mi_params->mi_stride + mi_col];
 1203|  7.24M|  xd->tx_type_map_stride = mi_params->mi_stride;
 1204|       |
 1205|  7.24M|  set_plane_n4(xd, bw, bh, num_planes);
 1206|       |
 1207|       |  // Distance of Mb to the various image edges. These are specified to 8th pel
 1208|       |  // as they are always compared to values that are in 1/8th pel units
 1209|  7.24M|  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
 1210|  7.24M|                 mi_params->mi_cols);
 1211|       |
 1212|  7.24M|  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
 1213|  7.24M|                       num_planes);
 1214|  7.24M|}
decodeframe.c:loop_restoration_read_sb_coeffs:
 1663|   255k|                                                   int plane, int runit_idx) {
 1664|   255k|  const RestorationInfo *rsi = &cm->rst_info[plane];
 1665|   255k|  RestorationUnitInfo *rui = &rsi->unit_info[runit_idx];
 1666|   255k|  assert(rsi->frame_restoration_type != RESTORE_NONE);
 1667|       |
 1668|   255k|  assert(!cm->features.all_lossless);
 1669|       |
 1670|   255k|  const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
  ------------------
  |  |  128|   116k|#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
  |  |  ------------------
  |  |  |  |  121|   116k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|   116k|#define WIENER_HALFWIN 3
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
                const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
  ------------------
  |  |  121|   138k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|   138k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
  |  Branch (1670:26): [True: 116k, False: 138k]
  ------------------
 1671|   255k|  WienerInfo *wiener_info = xd->wiener_info + plane;
 1672|   255k|  SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane;
 1673|       |
 1674|   255k|  if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
  ------------------
  |  Branch (1674:7): [True: 75.9k, False: 179k]
  ------------------
 1675|  75.9k|    rui->restoration_type =
 1676|  75.9k|        aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf,
  ------------------
  |  |   51|  75.9k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1677|  75.9k|                        RESTORE_SWITCHABLE_TYPES, ACCT_STR);
 1678|  75.9k|    switch (rui->restoration_type) {
 1679|  23.3k|      case RESTORE_WIENER:
  ------------------
  |  Branch (1679:7): [True: 23.3k, False: 52.6k]
  ------------------
 1680|  23.3k|        read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r);
 1681|  23.3k|        break;
 1682|  25.1k|      case RESTORE_SGRPROJ:
  ------------------
  |  Branch (1682:7): [True: 25.1k, False: 50.8k]
  ------------------
 1683|  25.1k|        read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r);
 1684|  25.1k|        break;
 1685|  27.4k|      default: assert(rui->restoration_type == RESTORE_NONE); break;
  ------------------
  |  Branch (1685:7): [True: 27.4k, False: 48.5k]
  ------------------
 1686|  75.9k|    }
 1687|   179k|  } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
  ------------------
  |  Branch (1687:14): [True: 44.6k, False: 134k]
  ------------------
 1688|  44.6k|    if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) {
  ------------------
  |  |   51|  44.6k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (51:3): [True: 20.1k, False: 24.5k]
  |  |  ------------------
  ------------------
 1689|  20.1k|      rui->restoration_type = RESTORE_WIENER;
 1690|  20.1k|      read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r);
 1691|  24.5k|    } else {
 1692|  24.5k|      rui->restoration_type = RESTORE_NONE;
 1693|  24.5k|    }
 1694|   135k|  } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
  ------------------
  |  Branch (1694:14): [True: 135k, False: 18.4E]
  ------------------
 1695|   135k|    if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) {
  ------------------
  |  |   51|   135k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (51:3): [True: 81.6k, False: 53.4k]
  |  |  ------------------
  ------------------
 1696|  81.6k|      rui->restoration_type = RESTORE_SGRPROJ;
 1697|  81.6k|      read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r);
 1698|  81.6k|    } else {
 1699|  53.4k|      rui->restoration_type = RESTORE_NONE;
 1700|  53.4k|    }
 1701|   135k|  }
 1702|   255k|}
decodeframe.c:read_wiener_filter:
 1565|  43.5k|                                      aom_reader *rb) {
 1566|  43.5k|  memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter));
 1567|  43.5k|  memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter));
 1568|       |
 1569|  43.5k|  if (wiener_win == WIENER_WIN)
  ------------------
  |  |  121|  43.5k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  43.5k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
  |  Branch (1569:7): [True: 19.4k, False: 24.1k]
  ------------------
 1570|  19.4k|    wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] =
  ------------------
  |  |  121|  19.4k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  19.4k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1571|  19.4k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  19.4k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1572|  19.4k|            rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
 1573|  19.4k|            WIENER_FILT_TAP0_SUBEXP_K,
 1574|  19.4k|            ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) +
 1575|  19.4k|        WIENER_FILT_TAP0_MINV;
  ------------------
  |  |  152|  19.4k|  (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
  |  |  ------------------
  |  |  |  |  137|  19.4k|#define WIENER_FILT_TAP0_MIDV (3)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
  |  |  ------------------
  |  |  |  |  144|  19.4k|#define WIENER_FILT_TAP0_BITS 4
  |  |  ------------------
  ------------------
 1576|  24.1k|  else
 1577|  24.1k|    wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = 0;
  ------------------
  |  |  121|  24.1k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  24.1k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1578|  43.5k|  wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2] =
  ------------------
  |  |  121|  43.5k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  43.5k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1579|  43.5k|      aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  43.5k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1580|  43.5k|          rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
 1581|  43.5k|          WIENER_FILT_TAP1_SUBEXP_K,
 1582|  43.5k|          ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
 1583|  43.5k|      WIENER_FILT_TAP1_MINV;
  ------------------
  |  |  154|  43.5k|  (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
  |  |  ------------------
  |  |  |  |  138|  43.5k|#define WIENER_FILT_TAP1_MIDV (-7)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
  |  |  ------------------
  |  |  |  |  145|  43.5k|#define WIENER_FILT_TAP1_BITS 5
  |  |  ------------------
  ------------------
 1584|  43.5k|  wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3] =
  ------------------
  |  |  121|  43.5k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  43.5k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1585|  43.5k|      aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  43.5k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1586|  43.5k|          rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
 1587|  43.5k|          WIENER_FILT_TAP2_SUBEXP_K,
 1588|  43.5k|          ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
 1589|  43.5k|      WIENER_FILT_TAP2_MINV;
  ------------------
  |  |  156|  43.5k|  (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
  |  |  ------------------
  |  |  |  |  139|  43.5k|#define WIENER_FILT_TAP2_MIDV (15)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
  |  |  ------------------
  |  |  |  |  146|  43.5k|#define WIENER_FILT_TAP2_BITS 6
  |  |  ------------------
  ------------------
 1590|       |  // The central element has an implicit +WIENER_FILT_STEP
 1591|  43.5k|  wiener_info->vfilter[WIENER_HALFWIN] =
  ------------------
  |  |   43|  43.5k|#define WIENER_HALFWIN 3
  ------------------
 1592|  43.5k|      -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] +
 1593|  43.5k|            wiener_info->vfilter[2]);
 1594|       |
 1595|  43.5k|  if (wiener_win == WIENER_WIN)
  ------------------
  |  |  121|  43.5k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  43.5k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
  |  Branch (1595:7): [True: 19.4k, False: 24.1k]
  ------------------
 1596|  19.4k|    wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] =
  ------------------
  |  |  121|  19.4k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  19.4k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1597|  19.4k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  19.4k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1598|  19.4k|            rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
 1599|  19.4k|            WIENER_FILT_TAP0_SUBEXP_K,
 1600|  19.4k|            ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) +
 1601|  19.4k|        WIENER_FILT_TAP0_MINV;
  ------------------
  |  |  152|  19.4k|  (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
  |  |  ------------------
  |  |  |  |  137|  19.4k|#define WIENER_FILT_TAP0_MIDV (3)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
  |  |  ------------------
  |  |  |  |  144|  19.4k|#define WIENER_FILT_TAP0_BITS 4
  |  |  ------------------
  ------------------
 1602|  24.1k|  else
 1603|  24.1k|    wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = 0;
  ------------------
  |  |  121|  24.1k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  24.1k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1604|  43.5k|  wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2] =
  ------------------
  |  |  121|  43.5k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  43.5k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1605|  43.5k|      aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  43.5k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1606|  43.5k|          rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
 1607|  43.5k|          WIENER_FILT_TAP1_SUBEXP_K,
 1608|  43.5k|          ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
 1609|  43.5k|      WIENER_FILT_TAP1_MINV;
  ------------------
  |  |  154|  43.5k|  (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
  |  |  ------------------
  |  |  |  |  138|  43.5k|#define WIENER_FILT_TAP1_MIDV (-7)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
  |  |  ------------------
  |  |  |  |  145|  43.5k|#define WIENER_FILT_TAP1_BITS 5
  |  |  ------------------
  ------------------
 1610|  43.5k|  wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3] =
  ------------------
  |  |  121|  43.5k|#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
  |  |  ------------------
  |  |  |  |   43|  43.5k|#define WIENER_HALFWIN 3
  |  |  ------------------
  ------------------
 1611|  43.5k|      aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  43.5k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1612|  43.5k|          rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
 1613|  43.5k|          WIENER_FILT_TAP2_SUBEXP_K,
 1614|  43.5k|          ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
 1615|  43.5k|      WIENER_FILT_TAP2_MINV;
  ------------------
  |  |  156|  43.5k|  (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
  |  |  ------------------
  |  |  |  |  139|  43.5k|#define WIENER_FILT_TAP2_MIDV (15)
  |  |  ------------------
  |  |                 (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
  |  |  ------------------
  |  |  |  |  146|  43.5k|#define WIENER_FILT_TAP2_BITS 6
  |  |  ------------------
  ------------------
 1616|       |  // The central element has an implicit +WIENER_FILT_STEP
 1617|  43.5k|  wiener_info->hfilter[WIENER_HALFWIN] =
  ------------------
  |  |   43|  43.5k|#define WIENER_HALFWIN 3
  ------------------
 1618|  43.5k|      -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] +
 1619|  43.5k|            wiener_info->hfilter[2]);
 1620|  43.5k|  *ref_wiener_info = *wiener_info;
 1621|  43.5k|}
decodeframe.c:read_sgrproj_filter:
 1625|   106k|                                       aom_reader *rb) {
 1626|   106k|  sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
  ------------------
  |  |   47|   106k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1627|   106k|  const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
 1628|       |
 1629|   106k|  if (params->r[0] == 0) {
  ------------------
  |  Branch (1629:7): [True: 20.5k, False: 86.0k]
  ------------------
 1630|  20.5k|    sgrproj_info->xqd[0] = 0;
 1631|  20.5k|    sgrproj_info->xqd[1] =
 1632|  20.5k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  20.5k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1633|  20.5k|            rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
 1634|  20.5k|            ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
 1635|  20.5k|        SGRPROJ_PRJ_MIN1;
  ------------------
  |  |  108|  20.5k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  ------------------
  |  |  |  |   99|  20.5k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1636|  86.0k|  } else if (params->r[1] == 0) {
  ------------------
  |  Branch (1636:14): [True: 20.9k, False: 65.0k]
  ------------------
 1637|  20.9k|    sgrproj_info->xqd[0] =
 1638|  20.9k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  20.9k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1639|  20.9k|            rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
 1640|  20.9k|            ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
 1641|  20.9k|        SGRPROJ_PRJ_MIN0;
  ------------------
  |  |  106|  20.9k|#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
  |  |  ------------------
  |  |  |  |   99|  20.9k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1642|  20.9k|    sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0],
  ------------------
  |  |   99|  20.9k|#define SGRPROJ_PRJ_BITS 7
  ------------------
 1643|  20.9k|                                 SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
  ------------------
  |  |  108|  20.9k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  ------------------
  |  |  |  |   99|  20.9k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
                                               SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
  ------------------
  |  |  109|  20.9k|#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |  108|  20.9k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  |  |  ------------------
  |  |  |  |  |  |   99|  20.9k|#define SGRPROJ_PRJ_BITS 7
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
  |  |  ------------------
  |  |  |  |   99|  20.9k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1644|  65.0k|  } else {
 1645|  65.0k|    sgrproj_info->xqd[0] =
 1646|  65.0k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  65.0k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1647|  65.0k|            rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
 1648|  65.0k|            ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
 1649|  65.0k|        SGRPROJ_PRJ_MIN0;
  ------------------
  |  |  106|  65.0k|#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
  |  |  ------------------
  |  |  |  |   99|  65.0k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1650|  65.0k|    sgrproj_info->xqd[1] =
 1651|  65.0k|        aom_read_primitive_refsubexpfin(
  ------------------
  |  |   28|  65.0k|  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1652|  65.0k|            rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
 1653|  65.0k|            ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
 1654|  65.0k|        SGRPROJ_PRJ_MIN1;
  ------------------
  |  |  108|  65.0k|#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
  |  |  ------------------
  |  |  |  |   99|  65.0k|#define SGRPROJ_PRJ_BITS 7
  |  |  ------------------
  ------------------
 1655|  65.0k|  }
 1656|       |
 1657|   106k|  *ref_sgrproj_info = *sgrproj_info;
 1658|   106k|}
decodeframe.c:read_partition:
 1226|  10.5M|                                     BLOCK_SIZE bsize) {
 1227|  10.5M|  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
 1228|  10.5M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1229|       |
 1230|  10.5M|  if (!has_rows && !has_cols) return PARTITION_SPLIT;
  ------------------
  |  Branch (1230:7): [True: 224k, False: 10.3M]
  |  Branch (1230:20): [True: 104k, False: 119k]
  ------------------
 1231|       |
 1232|  10.4M|  assert(ctx >= 0);
 1233|  10.4M|  aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx];
 1234|  10.4M|  if (has_rows && has_cols) {
  ------------------
  |  Branch (1234:7): [True: 10.3M, False: 118k]
  |  Branch (1234:19): [True: 10.1M, False: 202k]
  ------------------
 1235|  10.1M|    return (PARTITION_TYPE)aom_read_symbol(
  ------------------
  |  |   51|  10.1M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1236|  10.1M|        r, partition_cdf, partition_cdf_length(bsize), ACCT_STR);
 1237|  10.1M|  } else if (!has_rows && has_cols) {
  ------------------
  |  Branch (1237:14): [True: 119k, False: 202k]
  |  Branch (1237:27): [True: 119k, False: 18.4E]
  ------------------
 1238|   119k|    assert(bsize > BLOCK_8X8);
 1239|   119k|    aom_cdf_prob cdf[2];
 1240|   119k|    partition_gather_vert_alike(cdf, partition_cdf, bsize);
 1241|   119k|    assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
 1242|   119k|    return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ;
  ------------------
  |  |   49|   119k|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (49:3): [True: 48.8k, False: 70.2k]
  |  |  ------------------
  ------------------
 1243|   202k|  } else {
 1244|   202k|    assert(has_rows && !has_cols);
 1245|   203k|    assert(bsize > BLOCK_8X8);
 1246|   203k|    aom_cdf_prob cdf[2];
 1247|   203k|    partition_gather_horz_alike(cdf, partition_cdf, bsize);
 1248|   203k|    assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
 1249|   203k|    return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_VERT;
  ------------------
  |  |   49|   203k|  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (49:3): [True: 89.8k, False: 113k]
  |  |  ------------------
  ------------------
 1250|   203k|  }
 1251|  10.4M|}
decodeframe.c:signal_parse_sb_row_done:
 3142|   133k|                                            const int sb_mi_size) {
 3143|   133k|  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 3144|   133k|#if CONFIG_MULTITHREAD
 3145|   133k|  pthread_mutex_lock(pbi->row_mt_mutex_);
 3146|   133k|#endif
 3147|   133k|  assert(frame_row_mt_info->mi_rows_parse_done >=
 3148|   133k|         frame_row_mt_info->mi_rows_decode_started);
 3149|   133k|  tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size;
 3150|   133k|  frame_row_mt_info->mi_rows_parse_done += sb_mi_size;
 3151|   133k|#if CONFIG_MULTITHREAD
 3152|       |  // A new decode job is available. Wake up one worker thread to handle the
 3153|       |  // new decode job.
 3154|       |  // NOTE: This assumes we bump mi_rows_parse_done and mi_rows_decode_started
 3155|       |  // by the same increment (sb_mi_size).
 3156|   133k|  pthread_cond_signal(pbi->row_mt_cond_);
 3157|   133k|  pthread_mutex_unlock(pbi->row_mt_mutex_);
 3158|   133k|#endif
 3159|   133k|}
decodeframe.c:check_trailing_bits_after_symbol_coder:
 2674|   103k|static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
 2675|   103k|  if (aom_reader_has_overflowed(r)) return -1;
  ------------------
  |  Branch (2675:7): [True: 0, False: 103k]
  ------------------
 2676|       |
 2677|   103k|  uint32_t nb_bits = aom_reader_tell(r);
 2678|   103k|  uint32_t nb_bytes = (nb_bits + 7) >> 3;
 2679|   103k|  const uint8_t *p = aom_reader_find_begin(r) + nb_bytes;
 2680|       |
 2681|       |  // aom_reader_tell() returns 1 for a newly initialized decoder, and the
 2682|       |  // return value only increases as values are decoded. So nb_bits > 0, and
 2683|       |  // thus p > p_begin. Therefore accessing p[-1] is safe.
 2684|   103k|  uint8_t last_byte = p[-1];
 2685|   103k|  uint8_t pattern = 128 >> ((nb_bits - 1) & 7);
 2686|   103k|  if ((last_byte & (2 * pattern - 1)) != pattern) return -1;
  ------------------
  |  Branch (2686:7): [True: 7.11k, False: 96.5k]
  ------------------
 2687|       |
 2688|       |  // Make sure that all padding bytes are zero as required by the spec.
 2689|  96.5k|  const uint8_t *p_end = aom_reader_find_end(r);
 2690|   149k|  while (p < p_end) {
  ------------------
  |  Branch (2690:10): [True: 54.0k, False: 95.2k]
  ------------------
 2691|  54.0k|    if (*p != 0) return -1;
  ------------------
  |  Branch (2691:9): [True: 1.32k, False: 52.7k]
  ------------------
 2692|  52.7k|    p++;
 2693|  52.7k|  }
 2694|  95.2k|  return 0;
 2695|  96.5k|}
decodeframe.c:get_next_job_info:
 3038|   262k|                             int *end_of_frame) {
 3039|   262k|  AV1_COMMON *cm = &pbi->common;
 3040|   262k|  TileDataDec *tile_data;
 3041|   262k|  AV1DecRowMTSync *dec_row_mt_sync;
 3042|   262k|  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
 3043|   262k|  const int tile_rows_start = frame_row_mt_info->tile_rows_start;
 3044|   262k|  const int tile_rows_end = frame_row_mt_info->tile_rows_end;
 3045|   262k|  const int tile_cols_start = frame_row_mt_info->tile_cols_start;
 3046|   262k|  const int tile_cols_end = frame_row_mt_info->tile_cols_end;
 3047|   262k|  const int start_tile = frame_row_mt_info->start_tile;
 3048|   262k|  const int end_tile = frame_row_mt_info->end_tile;
 3049|   262k|  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
 3050|   262k|  int num_mis_to_decode, num_threads_working;
 3051|   262k|  int num_mis_waiting_for_decode;
 3052|   262k|  int min_threads_working = INT_MAX;
 3053|   262k|  int max_mis_to_decode = 0;
 3054|   262k|  int tile_row_idx, tile_col_idx;
 3055|   262k|  int tile_row = -1;
 3056|   262k|  int tile_col = -1;
 3057|       |
 3058|   262k|  memset(next_job_info, 0, sizeof(*next_job_info));
 3059|       |
 3060|       |  // Frame decode is completed or error is encountered.
 3061|   262k|  *end_of_frame = (frame_row_mt_info->mi_rows_decode_started ==
  ------------------
  |  Branch (3061:19): [True: 96.8k, False: 166k]
  ------------------
 3062|   262k|                   frame_row_mt_info->mi_rows_to_decode) ||
 3063|   262k|                  (frame_row_mt_info->row_mt_exit == 1);
  ------------------
  |  Branch (3063:19): [True: 4.96k, False: 161k]
  ------------------
 3064|   262k|  if (*end_of_frame) {
  ------------------
  |  Branch (3064:7): [True: 101k, False: 161k]
  ------------------
 3065|   101k|    return 1;
 3066|   101k|  }
 3067|       |
 3068|       |  // Decoding cannot start as bit-stream parsing is not complete.
 3069|   161k|  assert(frame_row_mt_info->mi_rows_parse_done >=
 3070|   161k|         frame_row_mt_info->mi_rows_decode_started);
 3071|   161k|  if (frame_row_mt_info->mi_rows_parse_done ==
  ------------------
  |  Branch (3071:7): [True: 35.1k, False: 126k]
  ------------------
 3072|   161k|      frame_row_mt_info->mi_rows_decode_started)
 3073|  35.1k|    return 0;
 3074|       |
 3075|       |  // Choose the tile to decode.
 3076|   259k|  for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end;
  ------------------
  |  Branch (3076:40): [True: 133k, False: 126k]
  ------------------
 3077|   133k|       ++tile_row_idx) {
 3078|   301k|    for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end;
  ------------------
  |  Branch (3078:42): [True: 167k, False: 133k]
  ------------------
 3079|   167k|         ++tile_col_idx) {
 3080|   167k|      if (tile_row_idx * cm->tiles.cols + tile_col_idx < start_tile ||
  ------------------
  |  Branch (3080:11): [True: 0, False: 167k]
  ------------------
 3081|   167k|          tile_row_idx * cm->tiles.cols + tile_col_idx > end_tile)
  ------------------
  |  Branch (3081:11): [True: 196, False: 167k]
  ------------------
 3082|    196|        continue;
 3083|       |
 3084|   167k|      tile_data = pbi->tile_data + tile_row_idx * cm->tiles.cols + tile_col_idx;
 3085|   167k|      dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 3086|       |
 3087|   167k|      num_threads_working = dec_row_mt_sync->num_threads_working;
 3088|   167k|      num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done -
 3089|   167k|                                    dec_row_mt_sync->mi_rows_decode_started) *
 3090|   167k|                                   dec_row_mt_sync->mi_cols;
 3091|   167k|      num_mis_to_decode =
 3092|   167k|          (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) *
 3093|   167k|          dec_row_mt_sync->mi_cols;
 3094|       |
 3095|   167k|      assert(num_mis_to_decode >= num_mis_waiting_for_decode);
 3096|       |
 3097|       |      // Pick the tile which has minimum number of threads working on it.
 3098|   167k|      if (num_mis_waiting_for_decode > 0) {
  ------------------
  |  Branch (3098:11): [True: 130k, False: 37.0k]
  ------------------
 3099|   130k|        if (num_threads_working < min_threads_working) {
  ------------------
  |  Branch (3099:13): [True: 126k, False: 4.01k]
  ------------------
 3100|   126k|          min_threads_working = num_threads_working;
 3101|   126k|          max_mis_to_decode = 0;
 3102|   126k|        }
 3103|   130k|        if (num_threads_working == min_threads_working &&
  ------------------
  |  Branch (3103:13): [True: 130k, False: 250]
  ------------------
 3104|   130k|            num_mis_to_decode > max_mis_to_decode &&
  ------------------
  |  Branch (3104:13): [True: 126k, False: 3.70k]
  ------------------
 3105|   130k|            num_threads_working <
  ------------------
  |  Branch (3105:13): [True: 126k, False: 382]
  ------------------
 3106|   126k|                get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info)) {
 3107|   126k|          max_mis_to_decode = num_mis_to_decode;
 3108|   126k|          tile_row = tile_row_idx;
 3109|   126k|          tile_col = tile_col_idx;
 3110|   126k|        }
 3111|   130k|      }
 3112|   167k|    }
 3113|   133k|  }
 3114|       |  // No job found to process
 3115|   126k|  if (tile_row == -1 || tile_col == -1) return 0;
  ------------------
  |  Branch (3115:7): [True: 85, False: 125k]
  |  Branch (3115:25): [True: 0, False: 125k]
  ------------------
 3116|       |
 3117|   125k|  tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
 3118|   125k|  dec_row_mt_sync = &tile_data->dec_row_mt_sync;
 3119|       |
 3120|   125k|  next_job_info->tile_row = tile_row;
 3121|   125k|  next_job_info->tile_col = tile_col;
 3122|   125k|  next_job_info->mi_row = dec_row_mt_sync->mi_rows_decode_started +
 3123|   125k|                          tile_data->tile_info.mi_row_start;
 3124|       |
 3125|   125k|  dec_row_mt_sync->num_threads_working++;
 3126|   125k|  dec_row_mt_sync->mi_rows_decode_started += sb_mi_size;
 3127|   125k|  frame_row_mt_info->mi_rows_decode_started += sb_mi_size;
 3128|   125k|  assert(frame_row_mt_info->mi_rows_parse_done >=
 3129|   125k|         frame_row_mt_info->mi_rows_decode_started);
 3130|   125k|#if CONFIG_MULTITHREAD
 3131|   125k|  if (frame_row_mt_info->mi_rows_decode_started ==
  ------------------
  |  Branch (3131:7): [True: 60.3k, False: 65.5k]
  ------------------
 3132|   125k|      frame_row_mt_info->mi_rows_to_decode) {
 3133|  60.3k|    pthread_cond_broadcast(pbi->row_mt_cond_);
 3134|  60.3k|  }
 3135|   125k|#endif
 3136|       |
 3137|   125k|  return 1;
 3138|   125k|}
decodeframe.c:decode_tile_sb_row:
 2636|   125k|                                      const int mi_row) {
 2637|   125k|  AV1_COMMON *const cm = &pbi->common;
 2638|   125k|  const int num_planes = av1_num_planes(cm);
 2639|   125k|  TileDataDec *const tile_data = pbi->tile_data +
 2640|   125k|                                 tile_info->tile_row * cm->tiles.cols +
 2641|   125k|                                 tile_info->tile_col;
 2642|   125k|  const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
 2643|   125k|  const int sb_row_in_tile =
 2644|   125k|      (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
 2645|   125k|  int sb_col_in_tile = 0;
 2646|   125k|  int row_mt_exit = 0;
 2647|       |
 2648|   577k|  for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
  ------------------
  |  Branch (2648:46): [True: 452k, False: 125k]
  ------------------
 2649|   452k|       mi_col += cm->seq_params->mib_size, sb_col_in_tile++) {
 2650|   452k|    set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row,
 2651|   452k|                  mi_col);
 2652|       |
 2653|   452k|    sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile);
 2654|       |
 2655|   452k|#if CONFIG_MULTITHREAD
 2656|   452k|    pthread_mutex_lock(pbi->row_mt_mutex_);
 2657|   452k|#endif
 2658|   452k|    row_mt_exit = pbi->frame_row_mt_info.row_mt_exit;
 2659|   452k|#if CONFIG_MULTITHREAD
 2660|   452k|    pthread_mutex_unlock(pbi->row_mt_mutex_);
 2661|   452k|#endif
 2662|       |
 2663|   452k|    if (!row_mt_exit) {
  ------------------
  |  Branch (2663:9): [True: 450k, False: 1.95k]
  ------------------
 2664|       |      // Decoding of the super-block
 2665|   450k|      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
 2666|   450k|                       cm->seq_params->sb_size, 0x2);
 2667|   450k|    }
 2668|       |
 2669|   452k|    sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile,
 2670|   452k|               sb_cols_in_tile);
 2671|   452k|  }
 2672|   125k|}
decodeframe.c:sync_read:
 2568|   451k|                             int c) {
 2569|   451k|#if CONFIG_MULTITHREAD
 2570|   451k|  const int nsync = dec_row_mt_sync->sync_range;
 2571|       |
 2572|   451k|  if (r && !(c & (nsync - 1))) {
  ------------------
  |  Branch (2572:7): [True: 297k, False: 154k]
  |  Branch (2572:12): [True: 297k, False: 0]
  ------------------
 2573|   297k|    pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1];
 2574|   297k|    pthread_mutex_lock(mutex);
 2575|       |
 2576|   378k|    while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync -
  ------------------
  |  Branch (2576:12): [True: 80.5k, False: 297k]
  ------------------
 2577|   378k|                   dec_row_mt_sync->intrabc_extra_top_right_sb_delay) {
 2578|  80.5k|      pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex);
 2579|  80.5k|    }
 2580|   297k|    pthread_mutex_unlock(mutex);
 2581|   297k|  }
 2582|       |#else
 2583|       |  (void)dec_row_mt_sync;
 2584|       |  (void)r;
 2585|       |  (void)c;
 2586|       |#endif  // CONFIG_MULTITHREAD
 2587|   451k|}
decodeframe.c:launch_dec_workers:
 3471|  68.0k|                                      int num_workers) {
 3472|  68.0k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 3473|       |
 3474|   193k|  for (int worker_idx = num_workers - 1; worker_idx >= 0; --worker_idx) {
  ------------------
  |  Branch (3474:42): [True: 125k, False: 68.0k]
  ------------------
 3475|   125k|    AVxWorker *const worker = &pbi->tile_workers[worker_idx];
 3476|   125k|    DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
 3477|       |
 3478|   125k|    thread_data->data_end = data_end;
 3479|       |
 3480|   125k|    worker->had_error = 0;
 3481|   125k|    if (worker_idx == 0) {
  ------------------
  |  Branch (3481:9): [True: 68.0k, False: 57.1k]
  ------------------
 3482|  68.0k|      winterface->execute(worker);
 3483|  68.0k|    } else {
 3484|  57.1k|      winterface->launch(worker);
 3485|  57.1k|    }
 3486|   125k|  }
 3487|  68.0k|}
decodeframe.c:sync_dec_workers:
 3489|  68.0k|static inline void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
 3490|  68.0k|  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 3491|  68.0k|  int corrupted = 0;
 3492|       |
 3493|   193k|  for (int worker_idx = num_workers; worker_idx > 0; --worker_idx) {
  ------------------
  |  Branch (3493:38): [True: 125k, False: 68.0k]
  ------------------
 3494|   125k|    AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1];
 3495|   125k|    aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker));
 3496|   125k|  }
 3497|       |
 3498|  68.0k|  pbi->dcb.corrupted = corrupted;
 3499|  68.0k|}
decodeframe.c:decode_tile:
 2720|  75.4k|                               int tile_row, int tile_col) {
 2721|  75.4k|  TileInfo tile_info;
 2722|       |
 2723|  75.4k|  AV1_COMMON *const cm = &pbi->common;
 2724|  75.4k|  const int num_planes = av1_num_planes(cm);
 2725|       |
 2726|  75.4k|  av1_tile_set_row(&tile_info, cm, tile_row);
 2727|  75.4k|  av1_tile_set_col(&tile_info, cm, tile_col);
 2728|  75.4k|  DecoderCodingBlock *const dcb = &td->dcb;
 2729|  75.4k|  MACROBLOCKD *const xd = &dcb->xd;
 2730|       |
 2731|  75.4k|  av1_zero_above_context(cm, xd, tile_info.mi_col_start, tile_info.mi_col_end,
 2732|  75.4k|                         tile_row);
 2733|  75.4k|  av1_reset_loop_filter_delta(xd, num_planes);
 2734|  75.4k|  av1_reset_loop_restoration(xd, num_planes);
 2735|       |
 2736|   183k|  for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
  ------------------
  |  Branch (2736:45): [True: 142k, False: 41.4k]
  ------------------
 2737|   142k|       mi_row += cm->seq_params->mib_size) {
 2738|   142k|    av1_zero_left_context(xd);
 2739|       |
 2740|   780k|    for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
  ------------------
  |  Branch (2740:47): [True: 672k, False: 108k]
  ------------------
 2741|   672k|         mi_col += cm->seq_params->mib_size) {
 2742|   672k|      set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0);
 2743|       |
 2744|       |      // Bit-stream parsing and decoding of the superblock
 2745|   672k|      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
 2746|   672k|                       cm->seq_params->sb_size, 0x3);
 2747|       |
 2748|   672k|      if (aom_reader_has_overflowed(td->bit_reader)) {
  ------------------
  |  Branch (2748:11): [True: 33.9k, False: 638k]
  ------------------
 2749|  33.9k|        aom_merge_corrupted_flag(&dcb->corrupted, 1);
 2750|  33.9k|        return;
 2751|  33.9k|      }
 2752|   672k|    }
 2753|   142k|  }
 2754|       |
 2755|  41.4k|  int corrupted =
 2756|  41.4k|      (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
  ------------------
  |  Branch (2756:7): [True: 3.06k, False: 38.3k]
  ------------------
 2757|  41.4k|  aom_merge_corrupted_flag(&dcb->corrupted, corrupted);
 2758|  41.4k|}
decodeframe.c:decode_tiles:
 2762|  79.6k|                                   int end_tile) {
 2763|  79.6k|  AV1_COMMON *const cm = &pbi->common;
 2764|  79.6k|  ThreadData *const td = &pbi->td;
 2765|  79.6k|  CommonTileParams *const tiles = &cm->tiles;
 2766|  79.6k|  const int tile_cols = tiles->cols;
 2767|  79.6k|  const int tile_rows = tiles->rows;
 2768|  79.6k|  const int n_tiles = tile_cols * tile_rows;
 2769|  79.6k|  TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
 2770|  79.6k|  const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
  ------------------
  |  |   34|  79.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 79.6k, False: 0]
  |  |  ------------------
  ------------------
 2771|  79.6k|  const int single_row = pbi->dec_tile_row >= 0;
 2772|  79.6k|  const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
  ------------------
  |  |   34|  79.6k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 79.6k, False: 0]
  |  |  ------------------
  ------------------
 2773|  79.6k|  const int single_col = pbi->dec_tile_col >= 0;
 2774|  79.6k|  int tile_rows_start;
 2775|  79.6k|  int tile_rows_end;
 2776|  79.6k|  int tile_cols_start;
 2777|  79.6k|  int tile_cols_end;
 2778|  79.6k|  int inv_col_order;
 2779|  79.6k|  int inv_row_order;
 2780|  79.6k|  int tile_row, tile_col;
 2781|  79.6k|  uint8_t allow_update_cdf;
 2782|  79.6k|  const uint8_t *raw_data_end = NULL;
 2783|       |
 2784|  79.6k|  if (tiles->large_scale) {
  ------------------
  |  Branch (2784:7): [True: 12.9k, False: 66.7k]
  ------------------
 2785|  12.9k|    tile_rows_start = single_row ? dec_tile_row : 0;
  ------------------
  |  Branch (2785:23): [True: 0, False: 12.9k]
  ------------------
 2786|  12.9k|    tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
  ------------------
  |  Branch (2786:21): [True: 0, False: 12.9k]
  ------------------
 2787|  12.9k|    tile_cols_start = single_col ? dec_tile_col : 0;
  ------------------
  |  Branch (2787:23): [True: 0, False: 12.9k]
  ------------------
 2788|  12.9k|    tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
  ------------------
  |  Branch (2788:21): [True: 0, False: 12.9k]
  ------------------
 2789|  12.9k|    inv_col_order = pbi->inv_tile_order && !single_col;
  ------------------
  |  Branch (2789:21): [True: 0, False: 12.9k]
  |  Branch (2789:44): [True: 0, False: 0]
  ------------------
 2790|  12.9k|    inv_row_order = pbi->inv_tile_order && !single_row;
  ------------------
  |  Branch (2790:21): [True: 0, False: 12.9k]
  |  Branch (2790:44): [True: 0, False: 0]
  ------------------
 2791|  12.9k|    allow_update_cdf = 0;
 2792|  66.7k|  } else {
 2793|  66.7k|    tile_rows_start = 0;
 2794|  66.7k|    tile_rows_end = tile_rows;
 2795|  66.7k|    tile_cols_start = 0;
 2796|  66.7k|    tile_cols_end = tile_cols;
 2797|  66.7k|    inv_col_order = pbi->inv_tile_order;
 2798|  66.7k|    inv_row_order = pbi->inv_tile_order;
 2799|  66.7k|    allow_update_cdf = 1;
 2800|  66.7k|  }
 2801|       |
 2802|       |  // No tiles to decode.
 2803|  79.6k|  if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
  ------------------
  |  Branch (2803:7): [True: 0, False: 79.6k]
  |  Branch (2803:43): [True: 0, False: 79.6k]
  ------------------
 2804|       |      // First tile is larger than end_tile.
 2805|  79.6k|      tile_rows_start * tiles->cols + tile_cols_start > end_tile ||
  ------------------
  |  Branch (2805:7): [True: 0, False: 79.6k]
  ------------------
 2806|       |      // Last tile is smaller than start_tile.
 2807|  79.6k|      (tile_rows_end - 1) * tiles->cols + tile_cols_end - 1 < start_tile)
  ------------------
  |  Branch (2807:7): [True: 0, False: 79.6k]
  ------------------
 2808|      0|    return data;
 2809|       |
 2810|  79.6k|  allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
  ------------------
  |  Branch (2810:22): [True: 66.7k, False: 12.9k]
  |  Branch (2810:42): [True: 60.2k, False: 6.44k]
  ------------------
 2811|       |
 2812|  79.6k|  assert(tile_rows <= MAX_TILE_ROWS);
 2813|  79.6k|  assert(tile_cols <= MAX_TILE_COLS);
 2814|       |
 2815|  79.6k|#if EXT_TILE_DEBUG
 2816|  79.6k|  if (tiles->large_scale && !pbi->ext_tile_debug)
  ------------------
  |  Branch (2816:7): [True: 12.9k, False: 66.7k]
  |  Branch (2816:29): [True: 0, False: 12.9k]
  ------------------
 2817|      0|    raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers);
 2818|  79.6k|  else if (tiles->large_scale && pbi->ext_tile_debug)
  ------------------
  |  Branch (2818:12): [True: 12.9k, False: 66.7k]
  |  Branch (2818:34): [True: 12.9k, False: 0]
  ------------------
 2819|  12.9k|    raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
 2820|  66.7k|  else
 2821|  66.7k|#endif  // EXT_TILE_DEBUG
 2822|  66.7k|    get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
 2823|       |
 2824|  79.6k|  if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
  ------------------
  |  Branch (2824:7): [True: 14.3k, False: 65.3k]
  |  Branch (2824:33): [True: 421, False: 64.8k]
  ------------------
 2825|  10.4k|    decoder_alloc_tile_data(pbi, n_tiles);
 2826|  10.4k|  }
 2827|  79.6k|  if (pbi->dcb.xd.seg_mask == NULL)
  ------------------
  |  Branch (2827:7): [True: 10.0k, False: 69.6k]
  ------------------
 2828|  79.6k|    CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
  ------------------
  |  |   51|  10.0k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  10.0k|  do {                                                    \
  |  |  |  |   69|  10.0k|    lval = (expr);                                        \
  |  |  |  |   70|  10.0k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 10.0k]
  |  |  |  |  ------------------
  |  |  |  |   71|  10.0k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  10.0k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 2829|  79.6k|                    (uint8_t *)aom_memalign(
 2830|  79.6k|                        16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
 2831|       |#if CONFIG_ACCOUNTING
 2832|       |  if (pbi->acct_enabled) {
 2833|       |    aom_accounting_reset(&pbi->accounting);
 2834|       |  }
 2835|       |#endif
 2836|       |
 2837|  79.6k|  set_decode_func_pointers(&pbi->td, 0x3);
 2838|       |
 2839|       |  // Load all tile information into thread_data.
 2840|  79.6k|  td->dcb = pbi->dcb;
 2841|       |
 2842|  79.6k|  td->dcb.corrupted = 0;
 2843|  79.6k|  td->dcb.mc_buf[0] = td->mc_buf[0];
 2844|  79.6k|  td->dcb.mc_buf[1] = td->mc_buf[1];
 2845|  79.6k|  td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst;
 2846|   230k|  for (int j = 0; j < 2; ++j) {
  ------------------
  |  Branch (2846:19): [True: 150k, False: 79.6k]
  ------------------
 2847|   150k|    td->dcb.xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
 2848|   150k|  }
 2849|       |
 2850|   155k|  for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
  ------------------
  |  Branch (2850:36): [True: 75.4k, False: 79.6k]
  ------------------
 2851|  75.4k|    const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
  ------------------
  |  Branch (2851:21): [True: 0, False: 75.4k]
  ------------------
 2852|       |
 2853|   151k|    for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
  ------------------
  |  Branch (2853:38): [True: 75.6k, False: 75.4k]
  ------------------
 2854|  75.6k|      const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
  ------------------
  |  Branch (2854:23): [True: 0, False: 75.6k]
  ------------------
 2855|  75.6k|      TileDataDec *const tile_data = pbi->tile_data + row * tiles->cols + col;
 2856|  75.6k|      const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col];
 2857|       |
 2858|  75.6k|      if (row * tiles->cols + col < start_tile ||
  ------------------
  |  Branch (2858:11): [True: 0, False: 75.6k]
  ------------------
 2859|  75.6k|          row * tiles->cols + col > end_tile)
  ------------------
  |  Branch (2859:11): [True: 33, False: 75.6k]
  ------------------
 2860|     33|        continue;
 2861|       |
 2862|  75.6k|      td->bit_reader = &tile_data->bit_reader;
 2863|  75.6k|      av1_zero(td->cb_buffer_base.dqcoeff);
  ------------------
  |  |   43|  75.6k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
 2864|  75.6k|      av1_tile_init(&td->dcb.xd.tile, cm, row, col);
 2865|  75.6k|      td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex;
 2866|  75.6k|      setup_bool_decoder(&td->dcb.xd, tile_bs_buf->data, data_end,
 2867|  75.6k|                         tile_bs_buf->size, &pbi->error, td->bit_reader,
 2868|  75.6k|                         allow_update_cdf);
 2869|       |#if CONFIG_ACCOUNTING
 2870|       |      if (pbi->acct_enabled) {
 2871|       |        td->bit_reader->accounting = &pbi->accounting;
 2872|       |        td->bit_reader->accounting->last_tell_frac =
 2873|       |            aom_reader_tell_frac(td->bit_reader);
 2874|       |      } else {
 2875|       |        td->bit_reader->accounting = NULL;
 2876|       |      }
 2877|       |#endif
 2878|  75.6k|      av1_init_macroblockd(cm, &td->dcb.xd);
 2879|  75.6k|      av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row,
 2880|  75.6k|                             &td->dcb.xd);
 2881|       |
 2882|       |      // Initialise the tile context from the frame context
 2883|  75.6k|      tile_data->tctx = *cm->fc;
 2884|  75.6k|      td->dcb.xd.tile_ctx = &tile_data->tctx;
 2885|       |
 2886|       |      // decode tile
 2887|  75.6k|      decode_tile(pbi, td, row, col);
 2888|  75.6k|      aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted);
 2889|  75.6k|      if (pbi->dcb.corrupted)
  ------------------
  |  Branch (2889:11): [True: 37.0k, False: 38.6k]
  ------------------
 2890|  37.0k|        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
 2891|  37.0k|                           "Failed to decode tile data");
 2892|  75.6k|    }
 2893|  75.4k|  }
 2894|       |
 2895|  79.6k|  if (tiles->large_scale) {
  ------------------
  |  Branch (2895:7): [True: 4.82k, False: 74.8k]
  ------------------
 2896|  4.82k|    if (n_tiles == 1) {
  ------------------
  |  Branch (2896:9): [True: 4.81k, False: 2]
  ------------------
 2897|       |      // Find the end of the single tile buffer
 2898|  4.81k|      return aom_reader_find_end(&pbi->tile_data->bit_reader);
 2899|  4.81k|    }
 2900|       |    // Return the end of the last tile buffer
 2901|      2|    return raw_data_end;
 2902|  4.82k|  }
 2903|  74.8k|  TileDataDec *const tile_data = pbi->tile_data + end_tile;
 2904|       |
 2905|  74.8k|  return aom_reader_find_end(&tile_data->bit_reader);
 2906|  79.6k|}
decodeframe.c:set_planes_to_neutral_grey:
  103|   286k|    int only_chroma) {
  104|   286k|  if (seq_params->use_highbitdepth) {
  ------------------
  |  Branch (104:7): [True: 102k, False: 184k]
  ------------------
  105|   102k|    const int val = 1 << (seq_params->bit_depth - 1);
  106|   400k|    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|   400k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (106:35): [True: 298k, False: 102k]
  ------------------
  107|   298k|      const int is_uv = plane > 0;
  108|   298k|      uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
  ------------------
  |  |   75|   298k|#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  ------------------
  109|       |      // Set the first row to neutral grey. Then copy the first row to all
  110|       |      // subsequent rows.
  111|   298k|      if (buf->crop_heights[is_uv] > 0) {
  ------------------
  |  Branch (111:11): [True: 298k, False: 0]
  ------------------
  112|   298k|        aom_memset16(base, val, buf->crop_widths[is_uv]);
  113|   122M|        for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) {
  ------------------
  |  Branch (113:31): [True: 121M, False: 298k]
  ------------------
  114|   121M|          memcpy(&base[row_idx * buf->strides[is_uv]], base,
  115|   121M|                 sizeof(*base) * buf->crop_widths[is_uv]);
  116|   121M|        }
  117|   298k|      }
  118|   298k|    }
  119|   184k|  } else {
  120|   735k|    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
  ------------------
  |  |   36|   735k|#define MAX_MB_PLANE 3
  ------------------
  |  Branch (120:35): [True: 551k, False: 184k]
  ------------------
  121|   551k|      const int is_uv = plane > 0;
  122|  65.4M|      for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
  ------------------
  |  Branch (122:29): [True: 64.8M, False: 551k]
  ------------------
  123|  64.8M|        memset(&buf->buffers[plane][row_idx * buf->strides[is_uv]], 1 << 7,
  124|  64.8M|               buf->crop_widths[is_uv]);
  125|  64.8M|      }
  126|   551k|    }
  127|   184k|  }
  128|   286k|}
decodeframe.c:superres_post_decode:
 5164|  30.8k|static inline void superres_post_decode(AV1Decoder *pbi) {
 5165|  30.8k|  AV1_COMMON *const cm = &pbi->common;
 5166|  30.8k|  BufferPool *const pool = cm->buffer_pool;
 5167|       |
 5168|  30.8k|  if (!av1_superres_scaled(cm)) return;
  ------------------
  |  Branch (5168:7): [True: 20.9k, False: 9.88k]
  ------------------
 5169|  9.88k|  assert(!cm->features.all_lossless);
 5170|       |
 5171|  9.88k|  av1_superres_upscale(cm, pool, 0);
 5172|  9.88k|}

av1_neg_deinterleave:
  258|  3.96M|int av1_neg_deinterleave(int diff, int ref, int max) {
  259|  3.96M|  if (!ref) return diff;
  ------------------
  |  Branch (259:7): [True: 913k, False: 3.04M]
  ------------------
  260|  3.04M|  if (ref >= (max - 1)) return max - diff - 1;
  ------------------
  |  Branch (260:7): [True: 351k, False: 2.69M]
  ------------------
  261|  2.69M|  if (2 * ref < max) {
  ------------------
  |  Branch (261:7): [True: 1.60M, False: 1.09M]
  ------------------
  262|  1.60M|    if (diff <= 2 * ref) {
  ------------------
  |  Branch (262:9): [True: 1.39M, False: 204k]
  ------------------
  263|  1.39M|      if (diff & 1)
  ------------------
  |  Branch (263:11): [True: 139k, False: 1.25M]
  ------------------
  264|   139k|        return ref + ((diff + 1) >> 1);
  265|  1.25M|      else
  266|  1.25M|        return ref - (diff >> 1);
  267|  1.39M|    }
  268|   204k|    return diff;
  269|  1.60M|  } else {
  270|  1.09M|    if (diff <= 2 * (max - ref - 1)) {
  ------------------
  |  Branch (270:9): [True: 968k, False: 127k]
  ------------------
  271|   968k|      if (diff & 1)
  ------------------
  |  Branch (271:11): [True: 90.1k, False: 878k]
  ------------------
  272|  90.1k|        return ref + ((diff + 1) >> 1);
  273|   878k|      else
  274|   878k|        return ref - (diff >> 1);
  275|   968k|    }
  276|   127k|    return max - (diff + 1);
  277|  1.09M|  }
  278|  2.69M|}
av1_read_tx_type:
  627|  12.0M|                      int blk_col, TX_SIZE tx_size, aom_reader *r) {
  628|  12.0M|  MB_MODE_INFO *mbmi = xd->mi[0];
  629|  12.0M|  uint8_t *tx_type =
  630|  12.0M|      &xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
  631|  12.0M|  *tx_type = DCT_DCT;
  632|       |
  633|       |  // No need to read transform type if block is skipped.
  634|  12.0M|  if (mbmi->skip_txfm ||
  ------------------
  |  Branch (634:7): [True: 18.4E, False: 12.0M]
  ------------------
  635|  12.0M|      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
  ------------------
  |  Branch (635:7): [True: 18.4E, False: 12.0M]
  ------------------
  636|      0|    return;
  637|       |
  638|       |  // No need to read transform type for lossless mode(qindex==0).
  639|  12.0M|  const int qindex = xd->qindex[mbmi->segment_id];
  640|  12.0M|  if (qindex == 0) return;
  ------------------
  |  Branch (640:7): [True: 2.60M, False: 9.45M]
  ------------------
  641|       |
  642|  9.45M|  const int inter_block = is_inter_block(mbmi);
  643|  9.45M|  if (get_ext_tx_types(tx_size, inter_block, cm->features.reduced_tx_set_used) >
  ------------------
  |  Branch (643:7): [True: 7.70M, False: 1.74M]
  ------------------
  644|  9.45M|      1) {
  645|  7.70M|    const TxSetType tx_set_type = av1_get_ext_tx_set_type(
  646|  7.70M|        tx_size, inter_block, cm->features.reduced_tx_set_used);
  647|  7.70M|    const int eset =
  648|  7.70M|        get_ext_tx_set(tx_size, inter_block, cm->features.reduced_tx_set_used);
  649|       |    // eset == 0 should correspond to a set with only DCT_DCT and
  650|       |    // there is no need to read the tx_type
  651|  7.70M|    assert(eset != 0);
  652|       |
  653|  7.70M|    const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
  654|  7.70M|    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  655|  7.70M|    if (inter_block) {
  ------------------
  |  Branch (655:9): [True: 3.64M, False: 4.06M]
  ------------------
  656|  3.64M|      *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
  ------------------
  |  |   51|  3.64M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  657|  3.64M|          r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
  658|  3.64M|          av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
  659|  4.06M|    } else {
  660|  4.06M|      const PREDICTION_MODE intra_mode =
  661|  4.06M|          mbmi->filter_intra_mode_info.use_filter_intra
  ------------------
  |  Branch (661:11): [True: 856k, False: 3.21M]
  ------------------
  662|  4.06M|              ? fimode_to_intradir[mbmi->filter_intra_mode_info
  663|   856k|                                       .filter_intra_mode]
  664|  4.06M|              : mbmi->mode;
  665|  4.06M|      *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
  ------------------
  |  |   51|  4.06M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  666|  4.06M|          r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode],
  667|  4.06M|          av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
  668|  4.06M|    }
  669|  7.70M|  }
  670|  9.45M|}
av1_read_mode_info:
 1572|  15.6M|                        aom_reader *r, int x_mis, int y_mis) {
 1573|  15.6M|  AV1_COMMON *const cm = &pbi->common;
 1574|  15.6M|  MACROBLOCKD *const xd = &dcb->xd;
 1575|  15.6M|  MB_MODE_INFO *const mi = xd->mi[0];
 1576|  15.6M|  mi->use_intrabc = 0;
 1577|       |
 1578|  15.6M|  if (frame_is_intra_only(cm)) {
  ------------------
  |  Branch (1578:7): [True: 9.25M, False: 6.36M]
  ------------------
 1579|  9.25M|    read_intra_frame_mode_info(cm, dcb, r);
 1580|  9.25M|    if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
  ------------------
  |  Branch (1580:9): [True: 4.75M, False: 4.49M]
  ------------------
 1581|  4.75M|      intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis);
 1582|  9.25M|  } else {
 1583|  6.36M|    read_inter_frame_mode_info(pbi, dcb, r);
 1584|  6.36M|    if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
  ------------------
  |  Branch (1584:9): [True: 6.25M, False: 102k]
  ------------------
 1585|  6.25M|      av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis);
 1586|  6.36M|  }
 1587|  15.6M|}
decodemv.c:read_intra_frame_mode_info:
  774|  9.25M|                                       DecoderCodingBlock *dcb, aom_reader *r) {
  775|  9.25M|  MACROBLOCKD *const xd = &dcb->xd;
  776|  9.25M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  777|  9.25M|  const MB_MODE_INFO *above_mi = xd->above_mbmi;
  778|  9.25M|  const MB_MODE_INFO *left_mi = xd->left_mbmi;
  779|  9.25M|  const BLOCK_SIZE bsize = mbmi->bsize;
  780|  9.25M|  struct segmentation *const seg = &cm->seg;
  781|       |
  782|  9.25M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  783|       |
  784|  9.25M|  if (seg->segid_preskip)
  ------------------
  |  Branch (784:7): [True: 3.67M, False: 5.57M]
  ------------------
  785|  3.67M|    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0);
  786|       |
  787|  9.25M|  mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r);
  788|       |
  789|  9.25M|  if (!seg->segid_preskip)
  ------------------
  |  Branch (789:7): [True: 5.57M, False: 3.67M]
  ------------------
  790|  5.57M|    mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip_txfm);
  791|       |
  792|  9.25M|  read_cdef(cm, r, xd);
  793|       |
  794|  9.25M|  read_delta_q_params(cm, xd, r);
  795|       |
  796|  9.25M|  mbmi->current_qindex = xd->current_base_qindex;
  797|       |
  798|  9.25M|  mbmi->ref_frame[0] = INTRA_FRAME;
  799|  9.25M|  mbmi->ref_frame[1] = NONE_FRAME;
  800|  9.25M|  mbmi->palette_mode_info.palette_size[0] = 0;
  801|  9.25M|  mbmi->palette_mode_info.palette_size[1] = 0;
  802|  9.25M|  mbmi->filter_intra_mode_info.use_filter_intra = 0;
  803|       |
  804|  9.25M|  const int mi_row = xd->mi_row;
  805|  9.25M|  const int mi_col = xd->mi_col;
  806|  9.25M|  xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + mi_col;
  807|  9.25M|  xd->left_txfm_context =
  808|  9.25M|      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
  ------------------
  |  |   50|  9.25M|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|  9.25M|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  9.25M|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|  9.25M|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  9.25M|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  809|       |
  810|  9.25M|  if (av1_allow_intrabc(cm)) {
  ------------------
  |  Branch (810:7): [True: 1.90M, False: 7.34M]
  ------------------
  811|  1.90M|    read_intrabc_info(cm, dcb, r);
  812|  1.90M|    if (is_intrabc_block(mbmi)) return;
  ------------------
  |  Branch (812:9): [True: 53.4k, False: 1.85M]
  ------------------
  813|  1.90M|  }
  814|       |
  815|  9.19M|  mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi));
  816|       |
  817|  9.19M|  const int use_angle_delta = av1_use_angle_delta(bsize);
  818|  9.19M|  mbmi->angle_delta[PLANE_TYPE_Y] =
  819|  9.19M|      (use_angle_delta && av1_is_directional_mode(mbmi->mode))
  ------------------
  |  Branch (819:8): [True: 8.06M, False: 1.13M]
  |  Branch (819:27): [True: 2.69M, False: 5.36M]
  ------------------
  820|  9.19M|          ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
  821|  9.19M|          : 0;
  822|       |
  823|  9.19M|  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
  ------------------
  |  Branch (823:7): [True: 8.61M, False: 582k]
  |  Branch (823:38): [True: 8.41M, False: 201k]
  ------------------
  824|  8.41M|    mbmi->uv_mode =
  825|  8.41M|        read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
  826|  8.41M|    if (mbmi->uv_mode == UV_CFL_PRED) {
  ------------------
  |  Branch (826:9): [True: 1.23M, False: 7.18M]
  ------------------
  827|  1.23M|      mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
  828|  1.23M|    }
  829|  8.41M|    const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode);
  830|  8.41M|    mbmi->angle_delta[PLANE_TYPE_UV] =
  831|  8.41M|        (use_angle_delta && av1_is_directional_mode(intra_mode))
  ------------------
  |  Branch (831:10): [True: 7.48M, False: 934k]
  |  Branch (831:29): [True: 1.95M, False: 5.52M]
  ------------------
  832|  8.41M|            ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED])
  833|  8.41M|            : 0;
  834|  8.41M|  } else {
  835|       |    // Avoid decoding angle_info if there is no chroma prediction
  836|   783k|    mbmi->uv_mode = UV_DC_PRED;
  837|   783k|  }
  838|  9.19M|  xd->cfl.store_y = store_cfl_required(cm, xd);
  839|       |
  840|  9.19M|  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize))
  ------------------
  |  Branch (840:7): [True: 2.28M, False: 6.91M]
  ------------------
  841|  2.28M|    read_palette_mode_info(cm, xd, r);
  842|       |
  843|  9.19M|  read_filter_intra_mode_info(cm, xd, r);
  844|  9.19M|}
decodemv.c:read_intra_segment_id:
  316|  9.25M|                                 aom_reader *r, int skip) {
  317|  9.25M|  struct segmentation *const seg = &cm->seg;
  318|  9.25M|  if (!seg->enabled) return 0;  // Default for disabled segmentation
  ------------------
  |  Branch (318:7): [True: 5.44M, False: 3.80M]
  ------------------
  319|  3.80M|  assert(seg->update_map && !seg->temporal_update);
  320|       |
  321|  3.80M|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  322|  3.80M|  const int mi_row = xd->mi_row;
  323|  3.80M|  const int mi_col = xd->mi_col;
  324|  3.80M|  const int mi_stride = cm->mi_params.mi_cols;
  325|  3.80M|  const int mi_offset = mi_row * mi_stride + mi_col;
  326|  3.80M|  const int bw = mi_size_wide[bsize];
  327|  3.80M|  const int bh = mi_size_high[bsize];
  328|  3.80M|  const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
  ------------------
  |  |   34|  3.80M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 44.9k, False: 3.75M]
  |  |  ------------------
  ------------------
  329|  3.80M|  const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh);
  ------------------
  |  |   34|  3.80M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 23.8k, False: 3.78M]
  |  |  ------------------
  ------------------
  330|  3.80M|  const int segment_id = read_segment_id(cm, xd, r, skip);
  331|  3.80M|  set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
  332|  3.80M|                 segment_id);
  333|  3.80M|  return segment_id;
  334|  3.80M|}
decodemv.c:read_segment_id:
  281|  4.00M|                           aom_reader *r, int skip) {
  282|  4.00M|  int cdf_num;
  283|  4.00M|  const uint8_t pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num, 0);
  284|  4.00M|  if (skip) return pred;
  ------------------
  |  Branch (284:7): [True: 40.7k, False: 3.96M]
  ------------------
  285|       |
  286|  3.96M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  287|  3.96M|  struct segmentation *const seg = &cm->seg;
  288|  3.96M|  struct segmentation_probs *const segp = &ec_ctx->seg;
  289|  3.96M|  aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
  290|  3.96M|  const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR);
  ------------------
  |  |   51|  3.96M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  291|  3.96M|  const int segment_id =
  292|  3.96M|      av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1);
  293|       |
  294|  3.96M|  if (segment_id < 0 || segment_id > seg->last_active_segid) {
  ------------------
  |  Branch (294:7): [True: 18.4E, False: 3.96M]
  |  Branch (294:25): [True: 706, False: 3.96M]
  ------------------
  295|    865|    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
  296|    865|                       "Corrupted segment_ids");
  297|    865|  }
  298|  3.96M|  return segment_id;
  299|  4.00M|}
decodemv.c:read_skip_txfm:
  447|  15.5M|                          aom_reader *r) {
  448|  15.5M|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
  ------------------
  |  Branch (448:7): [True: 3.79M, False: 11.7M]
  ------------------
  449|  3.79M|    return 1;
  450|  11.7M|  } else {
  451|  11.7M|    const int ctx = av1_get_skip_txfm_context(xd);
  452|  11.7M|    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  453|  11.7M|    const int skip_txfm =
  454|  11.7M|        aom_read_symbol(r, ec_ctx->skip_txfm_cdfs[ctx], 2, ACCT_STR);
  ------------------
  |  |   51|  11.7M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  455|  11.7M|    return skip_txfm;
  456|  11.7M|  }
  457|  15.5M|}
decodemv.c:read_cdef:
   40|  15.6M|static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
   41|  15.6M|  const int skip_txfm = xd->mi[0]->skip_txfm;
   42|  15.6M|  if (cm->features.coded_lossless) return;
  ------------------
  |  Branch (42:7): [True: 74.3k, False: 15.5M]
  ------------------
   43|  15.5M|  if (cm->features.allow_intrabc) {
  ------------------
  |  Branch (43:7): [True: 1.88M, False: 13.6M]
  ------------------
   44|  1.88M|    assert(cm->cdef_info.cdef_bits == 0);
   45|  1.88M|    return;
   46|  1.88M|  }
   47|       |
   48|       |  // At the start of a superblock, mark that we haven't yet read CDEF strengths
   49|       |  // for any of the CDEF units contained in this superblock.
   50|  13.6M|  const int sb_mask = (cm->seq_params->mib_size - 1);
   51|  13.6M|  const int mi_row_in_sb = (xd->mi_row & sb_mask);
   52|  13.6M|  const int mi_col_in_sb = (xd->mi_col & sb_mask);
   53|  13.6M|  if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
  ------------------
  |  Branch (53:7): [True: 2.79M, False: 10.8M]
  |  Branch (53:28): [True: 1.05M, False: 1.74M]
  ------------------
   54|  1.05M|    xd->cdef_transmitted[0] = xd->cdef_transmitted[1] =
   55|  1.05M|        xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false;
   56|  1.05M|  }
   57|       |
   58|       |  // CDEF unit size is 64x64 irrespective of the superblock size.
   59|  13.6M|  const int cdef_size = 1 << (6 - MI_SIZE_LOG2);
  ------------------
  |  |   39|  13.6M|#define MI_SIZE_LOG2 2
  ------------------
   60|       |
   61|       |  // Find index of this CDEF unit in this superblock.
   62|  13.6M|  const int index_mask = cdef_size;
   63|  13.6M|  const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
   64|  13.6M|  const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
   65|  13.6M|  const int index = (cm->seq_params->sb_size == BLOCK_128X128)
  ------------------
  |  Branch (65:21): [True: 9.66M, False: 3.98M]
  ------------------
   66|  13.6M|                        ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
   67|  13.6M|                        : 0;
   68|       |
   69|       |  // Read CDEF strength from the first non-skip coding block in this CDEF unit.
   70|  13.6M|  if (!xd->cdef_transmitted[index] && !skip_txfm) {
  ------------------
  |  Branch (70:7): [True: 5.98M, False: 7.66M]
  |  Branch (70:39): [True: 909k, False: 5.07M]
  ------------------
   71|       |    // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO
   72|       |    // of the 1st block in this CDEF unit.
   73|   909k|    const int first_block_mask = ~(cdef_size - 1);
   74|   909k|    CommonModeInfoParams *const mi_params = &cm->mi_params;
   75|   909k|    const int grid_idx =
   76|   909k|        get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask,
   77|   909k|                        xd->mi_col & first_block_mask);
   78|   909k|    MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx];
   79|   909k|    mbmi->cdef_strength =
   80|   909k|        aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR);
  ------------------
  |  |   47|   909k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   81|   909k|    xd->cdef_transmitted[index] = true;
   82|   909k|  }
   83|  13.6M|}
decodemv.c:read_delta_q_params:
  736|  15.6M|                                aom_reader *r) {
  737|  15.6M|  DeltaQInfo *const delta_q_info = &cm->delta_q_info;
  738|       |
  739|  15.6M|  if (delta_q_info->delta_q_present_flag) {
  ------------------
  |  Branch (739:7): [True: 2.16M, False: 13.4M]
  ------------------
  740|  2.16M|    MB_MODE_INFO *const mbmi = xd->mi[0];
  741|  2.16M|    xd->current_base_qindex +=
  742|  2.16M|        read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res;
  743|       |    /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
  744|  2.16M|    xd->current_base_qindex = clamp(xd->current_base_qindex, 1, MAXQ);
  ------------------
  |  |   26|  2.16M|#define MAXQ 255
  ------------------
  745|  2.16M|    FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
  746|  2.16M|    if (delta_q_info->delta_lf_present_flag) {
  ------------------
  |  Branch (746:9): [True: 462k, False: 1.70M]
  ------------------
  747|   462k|      const int mi_row = xd->mi_row;
  748|   462k|      const int mi_col = xd->mi_col;
  749|   462k|      if (delta_q_info->delta_lf_multi) {
  ------------------
  |  Branch (749:11): [True: 291k, False: 170k]
  ------------------
  750|   291k|        const int frame_lf_count =
  751|   291k|            av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  ------------------
  |  |   72|   272k|#define FRAME_LF_COUNT 4
  ------------------
                          av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  ------------------
  |  |   72|  18.7k|#define FRAME_LF_COUNT 4
  ------------------
  |  Branch (751:13): [True: 272k, False: 18.7k]
  ------------------
  752|  1.41M|        for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
  ------------------
  |  Branch (752:29): [True: 1.12M, False: 291k]
  ------------------
  753|  1.12M|          const int tmp_lvl =
  754|  1.12M|              xd->delta_lf[lf_id] +
  755|  1.12M|              read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi,
  756|  1.12M|                                 mi_col, mi_row) *
  757|  1.12M|                  delta_q_info->delta_lf_res;
  758|  1.12M|          mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
  759|  1.12M|              clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  1.12M|#define MAX_LOOP_FILTER 63
  ------------------
                            clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
  ------------------
  |  |   27|  1.12M|#define MAX_LOOP_FILTER 63
  ------------------
  760|  1.12M|        }
  761|   291k|      } else {
  762|   170k|        const int tmp_lvl = xd->delta_lf_from_base +
  763|   170k|                            read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf,
  764|   170k|                                               mbmi, mi_col, mi_row) *
  765|   170k|                                delta_q_info->delta_lf_res;
  766|   170k|        mbmi->delta_lf_from_base = xd->delta_lf_from_base =
  767|   170k|            clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   170k|#define MAX_LOOP_FILTER 63
  ------------------
                          clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
  ------------------
  |  |   27|   170k|#define MAX_LOOP_FILTER 63
  ------------------
  768|   170k|      }
  769|   462k|    }
  770|  2.16M|  }
  771|  15.6M|}
decodemv.c:read_delta_qindex:
   86|  2.16M|                             aom_reader *r, MB_MODE_INFO *const mbmi) {
   87|  2.16M|  int sign, abs, reduced_delta_qindex = 0;
   88|  2.16M|  BLOCK_SIZE bsize = mbmi->bsize;
   89|  2.16M|  const int b_col = xd->mi_col & (cm->seq_params->mib_size - 1);
   90|  2.16M|  const int b_row = xd->mi_row & (cm->seq_params->mib_size - 1);
   91|  2.16M|  const int read_delta_q_flag = (b_col == 0 && b_row == 0);
  ------------------
  |  Branch (91:34): [True: 434k, False: 1.73M]
  |  Branch (91:48): [True: 182k, False: 252k]
  ------------------
   92|  2.16M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   93|       |
   94|  2.16M|  if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) &&
  ------------------
  |  Branch (94:8): [True: 2.06M, False: 99.0k]
  |  Branch (94:44): [True: 16.0k, False: 83.0k]
  ------------------
   95|  2.16M|      read_delta_q_flag) {
  ------------------
  |  Branch (95:7): [True: 99.2k, False: 1.98M]
  ------------------
   96|  99.2k|    abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
  ------------------
  |  |   51|  99.2k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   97|  99.2k|    const int smallval = (abs < DELTA_Q_SMALL);
  ------------------
  |  |  497|  99.2k|#define DELTA_Q_SMALL 3
  ------------------
   98|       |
   99|  99.2k|    if (!smallval) {
  ------------------
  |  Branch (99:9): [True: 5.25k, False: 94.0k]
  ------------------
  100|  5.25k|      const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
  ------------------
  |  |   47|  5.25k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  101|  5.25k|      const int thr = (1 << rem_bits) + 1;
  102|  5.25k|      abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
  ------------------
  |  |   47|  5.25k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  103|  5.25k|    }
  104|       |
  105|  99.2k|    if (abs) {
  ------------------
  |  Branch (105:9): [True: 17.6k, False: 81.6k]
  ------------------
  106|  17.6k|      sign = aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  17.6k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  107|  81.6k|    } else {
  108|  81.6k|      sign = 1;
  109|  81.6k|    }
  110|       |
  111|  99.2k|    reduced_delta_qindex = sign ? -abs : abs;
  ------------------
  |  Branch (111:28): [True: 90.3k, False: 8.90k]
  ------------------
  112|  99.2k|  }
  113|  2.16M|  return reduced_delta_qindex;
  114|  2.16M|}
decodemv.c:read_delta_lflevel:
  118|  1.29M|                              int mi_row) {
  119|  1.29M|  int reduced_delta_lflevel = 0;
  120|  1.29M|  const BLOCK_SIZE bsize = mbmi->bsize;
  121|  1.29M|  const int b_col = mi_col & (cm->seq_params->mib_size - 1);
  122|  1.29M|  const int b_row = mi_row & (cm->seq_params->mib_size - 1);
  123|  1.29M|  const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
  ------------------
  |  Branch (123:35): [True: 430k, False: 868k]
  |  Branch (123:49): [True: 262k, False: 168k]
  ------------------
  124|       |
  125|  1.29M|  if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) &&
  ------------------
  |  Branch (125:8): [True: 1.12M, False: 172k]
  |  Branch (125:44): [True: 5.17k, False: 167k]
  ------------------
  126|  1.29M|      read_delta_lf_flag) {
  ------------------
  |  Branch (126:7): [True: 94.9k, False: 1.03M]
  ------------------
  127|  94.9k|    int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
  ------------------
  |  |   51|  94.9k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  128|  94.9k|    const int smallval = (abs < DELTA_LF_SMALL);
  ------------------
  |  |  503|  94.9k|#define DELTA_LF_SMALL 3
  ------------------
  129|  94.9k|    if (!smallval) {
  ------------------
  |  Branch (129:9): [True: 6.03k, False: 88.9k]
  ------------------
  130|  6.03k|      const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
  ------------------
  |  |   47|  6.03k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  131|  6.03k|      const int thr = (1 << rem_bits) + 1;
  132|  6.03k|      abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
  ------------------
  |  |   47|  6.03k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  133|  6.03k|    }
  134|  94.9k|    const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1;
  ------------------
  |  |   43|  20.8k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  |  Branch (134:22): [True: 20.8k, False: 74.1k]
  ------------------
  135|  94.9k|    reduced_delta_lflevel = sign ? -abs : abs;
  ------------------
  |  Branch (135:29): [True: 87.5k, False: 7.39k]
  ------------------
  136|  94.9k|  }
  137|  1.29M|  return reduced_delta_lflevel;
  138|  1.29M|}
decodemv.c:read_intrabc_info:
  694|  1.90M|                              aom_reader *r) {
  695|  1.90M|  MACROBLOCKD *const xd = &dcb->xd;
  696|  1.90M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  697|  1.90M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  698|  1.90M|  mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
  ------------------
  |  |   51|  1.90M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  699|  1.90M|  if (mbmi->use_intrabc) {
  ------------------
  |  Branch (699:7): [True: 60.1k, False: 1.84M]
  ------------------
  700|  60.1k|    BLOCK_SIZE bsize = mbmi->bsize;
  701|  60.1k|    mbmi->mode = DC_PRED;
  702|  60.1k|    mbmi->uv_mode = UV_DC_PRED;
  703|  60.1k|    mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
  704|  60.1k|    mbmi->motion_mode = SIMPLE_TRANSLATION;
  705|       |
  706|  60.1k|    int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
  707|  60.1k|    int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
  708|       |
  709|  60.1k|    av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count,
  710|  60.1k|                     xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL,
  711|  60.1k|                     inter_mode_ctx);
  712|       |
  713|  60.1k|    int_mv nearestmv, nearmv;
  714|       |
  715|  60.1k|    av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0);
  716|  60.1k|    int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
  ------------------
  |  Branch (716:21): [True: 27.8k, False: 32.2k]
  ------------------
  717|  60.1k|    if (dv_ref.as_int == 0)
  ------------------
  |  Branch (717:9): [True: 27.8k, False: 32.2k]
  ------------------
  718|  27.8k|      av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params->mib_size, xd->mi_row);
  719|       |    // Ref DV should not have sub-pel.
  720|  60.1k|    int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0;
  ------------------
  |  Branch (720:20): [True: 60.1k, False: 0]
  |  Branch (720:51): [True: 60.1k, False: 4]
  ------------------
  721|  60.1k|    dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8;
  722|  60.1k|    dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8;
  723|  60.1k|    valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row,
  ------------------
  |  Branch (723:16): [True: 60.1k, False: 4]
  |  Branch (723:28): [True: 53.4k, False: 6.71k]
  ------------------
  724|  60.1k|                                     xd->mi_col, bsize, r);
  725|  60.1k|    if (!valid_dv) {
  ------------------
  |  Branch (725:9): [True: 6.70k, False: 53.4k]
  ------------------
  726|       |      // Intra bc motion vectors are not valid - signal corrupt frame
  727|  6.70k|      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
  728|  6.70k|                         "Invalid intrabc dv");
  729|  6.70k|    }
  730|  60.1k|  }
  731|  1.90M|}
decodemv.c:assign_dv:
  679|  60.1k|                            BLOCK_SIZE bsize, aom_reader *r) {
  680|  60.1k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  681|  60.1k|  read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, MV_SUBPEL_NONE);
  682|       |  // DV should not have sub-pel.
  683|  60.1k|  assert((mv->as_mv.col & 7) == 0);
  684|  60.1k|  assert((mv->as_mv.row & 7) == 0);
  685|  60.1k|  mv->as_mv.col = (mv->as_mv.col >> 3) * 8;
  686|  60.1k|  mv->as_mv.row = (mv->as_mv.row >> 3) * 8;
  687|  60.1k|  int valid = is_mv_valid(&mv->as_mv) &&
  ------------------
  |  Branch (687:15): [True: 60.0k, False: 40]
  ------------------
  688|  60.1k|              av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize,
  ------------------
  |  Branch (688:15): [True: 53.4k, False: 6.67k]
  ------------------
  689|  60.0k|                              cm->seq_params->mib_size_log2);
  690|  60.1k|  return valid;
  691|  60.1k|}
decodemv.c:read_mv:
  887|  2.25M|                           nmv_context *ctx, MvSubpelPrecision precision) {
  888|  2.25M|  MV diff = kZeroMv;
  889|  2.25M|  const MV_JOINT_TYPE joint_type =
  890|  2.25M|      (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR);
  ------------------
  |  |   51|  2.25M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  891|       |
  892|  2.25M|  if (mv_joint_vertical(joint_type))
  ------------------
  |  Branch (892:7): [True: 1.26M, False: 990k]
  ------------------
  893|  1.26M|    diff.row = read_mv_component(r, &ctx->comps[0], precision > MV_SUBPEL_NONE,
  894|  1.26M|                                 precision > MV_SUBPEL_LOW_PRECISION);
  895|       |
  896|  2.25M|  if (mv_joint_horizontal(joint_type))
  ------------------
  |  Branch (896:7): [True: 1.12M, False: 1.12M]
  ------------------
  897|  1.12M|    diff.col = read_mv_component(r, &ctx->comps[1], precision > MV_SUBPEL_NONE,
  898|  1.12M|                                 precision > MV_SUBPEL_LOW_PRECISION);
  899|       |
  900|  2.25M|  mv->row = ref->row + diff.row;
  901|  2.25M|  mv->col = ref->col + diff.col;
  902|  2.25M|}
decodemv.c:read_mv_component:
  847|  2.38M|                             int use_subpel, int usehp) {
  848|  2.38M|  int mag, d, fr, hp;
  849|  2.38M|  const int sign = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR);
  ------------------
  |  |   51|  2.38M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  850|  2.38M|  const int mv_class =
  851|  2.38M|      aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR);
  ------------------
  |  |   51|  2.38M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  852|  2.38M|  const int class0 = mv_class == MV_CLASS_0;
  853|       |
  854|       |  // Integer part
  855|  2.38M|  if (class0) {
  ------------------
  |  Branch (855:7): [True: 1.95M, False: 437k]
  ------------------
  856|  1.95M|    d = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR);
  ------------------
  |  |   51|  1.95M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  857|  1.95M|    mag = 0;
  858|  1.95M|  } else {
  859|   437k|    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
  ------------------
  |  |   64|   437k|#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
  ------------------
  860|   437k|    d = 0;
  861|  2.78M|    for (int i = 0; i < n; ++i)
  ------------------
  |  Branch (861:21): [True: 2.35M, False: 437k]
  ------------------
  862|  2.35M|      d |= aom_read_symbol(r, mvcomp->bits_cdf[i], 2, ACCT_STR) << i;
  ------------------
  |  |   51|  2.35M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  863|   437k|    mag = CLASS0_SIZE << (mv_class + 2);
  ------------------
  |  |   65|   437k|#define CLASS0_SIZE (1 << CLASS0_BITS)
  |  |  ------------------
  |  |  |  |   64|   437k|#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
  |  |  ------------------
  ------------------
  864|   437k|  }
  865|       |
  866|  2.38M|  if (use_subpel) {
  ------------------
  |  Branch (866:7): [True: 2.19M, False: 191k]
  ------------------
  867|       |    // Fractional part
  868|  2.19M|    fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
  ------------------
  |  |   51|  4.39M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (51:23): [True: 1.78M, False: 408k]
  |  |  ------------------
  ------------------
  869|  2.19M|                         MV_FP_SIZE, ACCT_STR);
  870|       |
  871|       |    // High precision part (if hp is not used, the default value of the hp is 1)
  872|  2.19M|    hp = usehp ? aom_read_symbol(
  ------------------
  |  |   51|   426k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (51:23): [True: 136k, False: 77.0k]
  |  |  ------------------
  ------------------
  |  Branch (872:10): [True: 213k, False: 1.98M]
  ------------------
  873|  2.19M|                     r, class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2,
  874|  2.19M|                     ACCT_STR)
  875|  2.19M|               : 1;
  876|  2.19M|  } else {
  877|   191k|    fr = 3;
  878|   191k|    hp = 1;
  879|   191k|  }
  880|       |
  881|       |  // Result
  882|  2.38M|  mag += ((d << 3) | (fr << 1) | hp) + 1;
  883|  2.38M|  return sign ? -mag : mag;
  ------------------
  |  Branch (883:10): [True: 1.12M, False: 1.26M]
  ------------------
  884|  2.38M|}
decodemv.c:is_mv_valid:
 1109|  5.22M|static inline int is_mv_valid(const MV *mv) {
 1110|  5.22M|  return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW &&
  ------------------
  |  |   76|  10.4M|#define MV_LOW (-(1 << MV_IN_USE_BITS))
  |  |  ------------------
  |  |  |  |   74|  5.22M|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
                return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW &&
  ------------------
  |  |   75|  10.4M|#define MV_UPP (1 << MV_IN_USE_BITS)
  |  |  ------------------
  |  |  |  |   74|  5.20M|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
                return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW &&
  ------------------
  |  |   76|  10.4M|#define MV_LOW (-(1 << MV_IN_USE_BITS))
  |  |  ------------------
  |  |  |  |   74|  5.19M|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
  |  Branch (1110:10): [True: 5.20M, False: 26.7k]
  |  Branch (1110:30): [True: 5.19M, False: 3.48k]
  |  Branch (1110:50): [True: 5.18M, False: 16.6k]
  ------------------
 1111|  5.22M|         mv->col < MV_UPP;
  ------------------
  |  |   75|  5.18M|#define MV_UPP (1 << MV_IN_USE_BITS)
  |  |  ------------------
  |  |  |  |   74|  5.18M|#define MV_IN_USE_BITS 14
  |  |  ------------------
  ------------------
  |  Branch (1111:10): [True: 5.15M, False: 20.8k]
  ------------------
 1112|  5.22M|}
decodemv.c:read_intra_mode:
   36|  10.9M|static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
   37|  10.9M|  return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR);
  ------------------
  |  |   51|  10.9M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   38|  10.9M|}
decodemv.c:read_angle_delta:
  603|  5.25M|static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) {
  604|  5.25M|  const int sym = aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR);
  ------------------
  |  |   51|  5.25M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  605|  5.25M|  return sym - MAX_ANGLE_DELTA;
  ------------------
  |  |  467|  5.25M|#define MAX_ANGLE_DELTA 3
  ------------------
  606|  5.25M|}
decodemv.c:read_intra_mode_uv:
  143|  9.87M|                                             PREDICTION_MODE y_mode) {
  144|  9.87M|  const UV_PREDICTION_MODE uv_mode =
  145|  9.87M|      aom_read_symbol(r, ec_ctx->uv_mode_cdf[cfl_allowed][y_mode],
  ------------------
  |  |   51|  9.87M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  146|  9.87M|                      UV_INTRA_MODES - !cfl_allowed, ACCT_STR);
  147|  9.87M|  return uv_mode;
  148|  9.87M|}
decodemv.c:read_cfl_alphas:
  151|  1.54M|                               int8_t *signs_out) {
  152|  1.54M|  const int8_t joint_sign =
  153|  1.54M|      aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs");
  ------------------
  |  |   51|  1.54M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  154|  1.54M|  uint8_t idx = 0;
  155|       |  // Magnitudes are only coded for nonzero values
  156|  1.54M|  if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
  ------------------
  |  |  281|  1.54M|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  ------------------
  |  Branch (156:7): [True: 1.42M, False: 117k]
  ------------------
  157|  1.42M|    aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
  ------------------
  |  |  288|  1.42M|#define CFL_CONTEXT_U(js) (js + 1 - CFL_SIGNS)
  ------------------
  158|  1.42M|    idx = (uint8_t)aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u")
  ------------------
  |  |   51|  1.42M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  159|  1.42M|          << CFL_ALPHABET_SIZE_LOG2;
  ------------------
  |  |  256|  1.42M|#define CFL_ALPHABET_SIZE_LOG2 4
  ------------------
  160|  1.42M|  }
  161|  1.54M|  if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
  ------------------
  |  |  283|  1.54M|#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js))
  |  |  ------------------
  |  |  |  |  281|  1.54M|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  |  |  ------------------
  ------------------
  |  Branch (161:7): [True: 1.10M, False: 443k]
  ------------------
  162|  1.10M|    aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
  ------------------
  |  |  291|  1.10M|  (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
  |  |  ------------------
  |  |  |  |  283|  1.10M|#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js))
  |  |  |  |  ------------------
  |  |  |  |  |  |  281|  1.10M|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  |  |  |  |  ------------------
  |  |  ------------------
  |  |                 (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
  |  |  ------------------
  |  |  |  |  281|  1.10M|#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
  |  |  ------------------
  ------------------
  163|  1.10M|    idx += (uint8_t)aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v");
  ------------------
  |  |   51|  1.10M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  164|  1.10M|  }
  165|  1.54M|  *signs_out = joint_sign;
  166|  1.54M|  return idx;
  167|  1.54M|}
decodemv.c:read_palette_mode_info:
  568|  2.32M|                                   aom_reader *r) {
  569|  2.32M|  const int num_planes = av1_num_planes(cm);
  570|  2.32M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  571|  2.32M|  const BLOCK_SIZE bsize = mbmi->bsize;
  572|  2.32M|  assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
  573|  2.32M|  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
  574|  2.32M|  const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
  575|       |
  576|  2.32M|  if (mbmi->mode == DC_PRED) {
  ------------------
  |  Branch (576:7): [True: 744k, False: 1.58M]
  ------------------
  577|   744k|    const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
  578|   744k|    const int modev = aom_read_symbol(
  ------------------
  |  |   51|   744k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  579|   744k|        r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_mode_ctx], 2,
  580|   744k|        ACCT_STR);
  581|   744k|    if (modev) {
  ------------------
  |  Branch (581:9): [True: 88.6k, False: 655k]
  ------------------
  582|  88.6k|      pmi->palette_size[0] =
  583|  88.6k|          aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
  ------------------
  |  |   51|  88.6k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  584|  88.6k|                          PALETTE_SIZES, ACCT_STR) +
  585|  88.6k|          2;
  586|  88.6k|      read_palette_colors_y(xd, cm->seq_params->bit_depth, pmi, r);
  587|  88.6k|    }
  588|   744k|  }
  589|  2.32M|  if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref) {
  ------------------
  |  Branch (589:7): [True: 2.28M, False: 37.9k]
  |  Branch (589:25): [True: 527k, False: 1.76M]
  |  Branch (589:56): [True: 514k, False: 12.8k]
  ------------------
  590|   514k|    const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
  591|   514k|    const int modev = aom_read_symbol(
  ------------------
  |  |   51|   514k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  592|   514k|        r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR);
  593|   514k|    if (modev) {
  ------------------
  |  Branch (593:9): [True: 73.1k, False: 441k]
  ------------------
  594|  73.1k|      pmi->palette_size[1] =
  595|  73.1k|          aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
  ------------------
  |  |   51|  73.1k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  596|  73.1k|                          PALETTE_SIZES, ACCT_STR) +
  597|  73.1k|          2;
  598|  73.1k|      read_palette_colors_uv(xd, cm->seq_params->bit_depth, pmi, r);
  599|  73.1k|    }
  600|   514k|  }
  601|  2.32M|}
decodemv.c:read_palette_colors_y:
  479|  88.6k|                                  PALETTE_MODE_INFO *const pmi, aom_reader *r) {
  480|  88.6k|  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
  481|  88.6k|  uint16_t cached_colors[PALETTE_MAX_SIZE];
  482|  88.6k|  const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
  483|  88.6k|  const int n = pmi->palette_size[0];
  484|  88.6k|  int idx = 0;
  485|   383k|  for (int i = 0; i < n_cache && idx < n; ++i)
  ------------------
  |  Branch (485:19): [True: 304k, False: 79.0k]
  |  Branch (485:34): [True: 294k, False: 9.53k]
  ------------------
  486|   294k|    if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
  ------------------
  |  |   43|   294k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (43:3): [True: 150k, False: 144k]
  |  |  ------------------
  ------------------
  487|  88.6k|  if (idx < n) {
  ------------------
  |  Branch (487:7): [True: 70.2k, False: 18.3k]
  ------------------
  488|  70.2k|    const int n_cached_colors = idx;
  489|  70.2k|    pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR);
  ------------------
  |  |   47|  70.2k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  490|  70.2k|    if (idx < n) {
  ------------------
  |  Branch (490:9): [True: 62.4k, False: 7.77k]
  ------------------
  491|  62.4k|      const int min_bits = bit_depth - 3;
  492|  62.4k|      int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
  ------------------
  |  |   47|  62.4k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  493|  62.4k|      int range = (1 << bit_depth) - pmi->palette_colors[idx - 1] - 1;
  494|   246k|      for (; idx < n; ++idx) {
  ------------------
  |  Branch (494:14): [True: 184k, False: 62.4k]
  ------------------
  495|   184k|        assert(range >= 0);
  496|   184k|        const int delta = aom_read_literal(r, bits, ACCT_STR) + 1;
  ------------------
  |  |   47|   184k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  497|   184k|        pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
  498|   184k|                                         0, (1 << bit_depth) - 1);
  499|   184k|        range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
  500|   184k|        bits = AOMMIN(bits, aom_ceil_log2(range));
  ------------------
  |  |   34|   184k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 55.6k, False: 128k]
  |  |  ------------------
  ------------------
  501|   184k|      }
  502|  62.4k|    }
  503|  70.2k|    merge_colors(pmi->palette_colors, cached_colors, n, n_cached_colors);
  504|  70.2k|  } else {
  505|  18.3k|    memcpy(pmi->palette_colors, cached_colors, n * sizeof(cached_colors[0]));
  506|  18.3k|  }
  507|  88.6k|}
decodemv.c:merge_colors:
  463|   136k|                         int n_colors, int n_cached_colors) {
  464|   136k|  if (n_cached_colors == 0) return;
  ------------------
  |  Branch (464:7): [True: 90.1k, False: 46.7k]
  ------------------
  465|  46.7k|  int cache_idx = 0, trans_idx = n_cached_colors;
  466|   292k|  for (int i = 0; i < n_colors; ++i) {
  ------------------
  |  Branch (466:19): [True: 246k, False: 46.7k]
  ------------------
  467|   246k|    if (cache_idx < n_cached_colors &&
  ------------------
  |  Branch (467:9): [True: 177k, False: 68.5k]
  ------------------
  468|   246k|        (trans_idx >= n_colors ||
  ------------------
  |  Branch (468:10): [True: 34.9k, False: 142k]
  ------------------
  469|   177k|         cached_colors[cache_idx] <= colors[trans_idx])) {
  ------------------
  |  Branch (469:10): [True: 67.7k, False: 74.7k]
  ------------------
  470|   102k|      colors[i] = cached_colors[cache_idx++];
  471|   143k|    } else {
  472|   143k|      assert(trans_idx < n_colors);
  473|   143k|      colors[i] = colors[trans_idx++];
  474|   143k|    }
  475|   246k|  }
  476|  46.7k|}
decodemv.c:read_palette_colors_uv:
  511|  73.1k|                                   aom_reader *r) {
  512|  73.1k|  const int n = pmi->palette_size[1];
  513|       |  // U channel colors.
  514|  73.1k|  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
  515|  73.1k|  uint16_t cached_colors[PALETTE_MAX_SIZE];
  516|  73.1k|  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
  517|  73.1k|  int idx = 0;
  518|   124k|  for (int i = 0; i < n_cache && idx < n; ++i)
  ------------------
  |  Branch (518:19): [True: 57.6k, False: 66.8k]
  |  Branch (518:34): [True: 51.3k, False: 6.26k]
  ------------------
  519|  51.3k|    if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
  ------------------
  |  |   43|  51.3k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (43:3): [True: 28.7k, False: 22.6k]
  |  |  ------------------
  ------------------
  520|  73.1k|  if (idx < n) {
  ------------------
  |  Branch (520:7): [True: 66.6k, False: 6.43k]
  ------------------
  521|  66.6k|    const int n_cached_colors = idx;
  522|  66.6k|    idx += PALETTE_MAX_SIZE;
  ------------------
  |  |   63|  66.6k|#define PALETTE_MAX_SIZE 8
  ------------------
  523|  66.6k|    pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR);
  ------------------
  |  |   47|  66.6k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  524|  66.6k|    if (idx < PALETTE_MAX_SIZE + n) {
  ------------------
  |  |   63|  66.6k|#define PALETTE_MAX_SIZE 8
  ------------------
  |  Branch (524:9): [True: 63.8k, False: 2.87k]
  ------------------
  525|  63.8k|      const int min_bits = bit_depth - 3;
  526|  63.8k|      int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
  ------------------
  |  |   47|  63.8k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  527|  63.8k|      int range = (1 << bit_depth) - pmi->palette_colors[idx - 1];
  528|   284k|      for (; idx < PALETTE_MAX_SIZE + n; ++idx) {
  ------------------
  |  |   63|   284k|#define PALETTE_MAX_SIZE 8
  ------------------
  |  Branch (528:14): [True: 220k, False: 63.8k]
  ------------------
  529|   220k|        assert(range >= 0);
  530|   220k|        const int delta = aom_read_literal(r, bits, ACCT_STR);
  ------------------
  |  |   47|   220k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  531|   220k|        pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
  532|   220k|                                         0, (1 << bit_depth) - 1);
  533|   220k|        range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
  534|   220k|        bits = AOMMIN(bits, aom_ceil_log2(range));
  ------------------
  |  |   34|   220k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 45.1k, False: 175k]
  |  |  ------------------
  ------------------
  535|   220k|      }
  536|  63.8k|    }
  537|  66.6k|    merge_colors(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n,
  ------------------
  |  |   63|  66.6k|#define PALETTE_MAX_SIZE 8
  ------------------
  538|  66.6k|                 n_cached_colors);
  539|  66.6k|  } else {
  540|  6.43k|    memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors,
  ------------------
  |  |   63|  6.43k|#define PALETTE_MAX_SIZE 8
  ------------------
  541|  6.43k|           n * sizeof(cached_colors[0]));
  542|  6.43k|  }
  543|       |
  544|       |  // V channel colors.
  545|  73.1k|  if (aom_read_bit(r, ACCT_STR)) {  // Delta encoding.
  ------------------
  |  |   43|  73.1k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (43:3): [True: 30.2k, False: 42.8k]
  |  |  ------------------
  ------------------
  546|  30.2k|    const int min_bits_v = bit_depth - 4;
  547|  30.2k|    const int max_val = 1 << bit_depth;
  548|  30.2k|    int bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR);
  ------------------
  |  |   47|  30.2k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  549|  30.2k|    pmi->palette_colors[2 * PALETTE_MAX_SIZE] =
  ------------------
  |  |   63|  30.2k|#define PALETTE_MAX_SIZE 8
  ------------------
  550|  30.2k|        aom_read_literal(r, bit_depth, ACCT_STR);
  ------------------
  |  |   47|  30.2k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  551|   131k|    for (int i = 1; i < n; ++i) {
  ------------------
  |  Branch (551:21): [True: 101k, False: 30.2k]
  ------------------
  552|   101k|      int delta = aom_read_literal(r, bits, ACCT_STR);
  ------------------
  |  |   47|   101k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  553|   101k|      if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta;
  ------------------
  |  |   43|   100k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  |  |  |  Branch (43:3): [True: 48.4k, False: 52.5k]
  |  |  ------------------
  ------------------
  |  Branch (553:11): [True: 100k, False: 345]
  ------------------
  554|   101k|      int val = (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta;
  ------------------
  |  |   63|   101k|#define PALETTE_MAX_SIZE 8
  ------------------
  555|   101k|      if (val < 0) val += max_val;
  ------------------
  |  Branch (555:11): [True: 16.1k, False: 85.1k]
  ------------------
  556|   101k|      if (val >= max_val) val -= max_val;
  ------------------
  |  Branch (556:11): [True: 11.7k, False: 89.6k]
  ------------------
  557|   101k|      pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val;
  ------------------
  |  |   63|   101k|#define PALETTE_MAX_SIZE 8
  ------------------
  558|   101k|    }
  559|  42.8k|  } else {
  560|   227k|    for (int i = 0; i < n; ++i) {
  ------------------
  |  Branch (560:21): [True: 184k, False: 42.8k]
  ------------------
  561|   184k|      pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
  ------------------
  |  |   63|   184k|#define PALETTE_MAX_SIZE 8
  ------------------
  562|   184k|          aom_read_literal(r, bit_depth, ACCT_STR);
  ------------------
  |  |   47|   184k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  563|   184k|    }
  564|  42.8k|  }
  565|  73.1k|}
decodemv.c:read_filter_intra_mode_info:
  609|  10.9M|                                        MACROBLOCKD *const xd, aom_reader *r) {
  610|  10.9M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  611|  10.9M|  FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
  612|  10.9M|      &mbmi->filter_intra_mode_info;
  613|       |
  614|  10.9M|  if (av1_filter_intra_allowed(cm, mbmi)) {
  ------------------
  |  Branch (614:7): [True: 2.15M, False: 8.83M]
  ------------------
  615|  2.15M|    filter_intra_mode_info->use_filter_intra = aom_read_symbol(
  ------------------
  |  |   51|  2.15M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  616|  2.15M|        r, xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2, ACCT_STR);
  617|  2.15M|    if (filter_intra_mode_info->use_filter_intra) {
  ------------------
  |  Branch (617:9): [True: 1.27M, False: 888k]
  ------------------
  618|  1.27M|      filter_intra_mode_info->filter_intra_mode = aom_read_symbol(
  ------------------
  |  |   51|  1.27M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  619|  1.27M|          r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR);
  620|  1.27M|    }
  621|  8.83M|  } else {
  622|  8.83M|    filter_intra_mode_info->use_filter_intra = 0;
  623|  8.83M|  }
  624|  10.9M|}
decodemv.c:intra_copy_frame_mvs:
 1554|  4.75M|                                 int x_mis, int y_mis) {
 1555|  4.75M|  const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1);
  ------------------
  |  |   41|  4.75M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1556|  4.75M|  MV_REF *frame_mvs =
 1557|  4.75M|      cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
 1558|  4.75M|  x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
  ------------------
  |  |   41|  4.75M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1559|  4.75M|  y_mis = ROUND_POWER_OF_TWO(y_mis, 1);
  ------------------
  |  |   41|  4.75M|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
 1560|       |
 1561|  15.2M|  for (int h = 0; h < y_mis; h++) {
  ------------------
  |  Branch (1561:19): [True: 10.4M, False: 4.75M]
  ------------------
 1562|  10.4M|    MV_REF *mv = frame_mvs;
 1563|  57.3M|    for (int w = 0; w < x_mis; w++) {
  ------------------
  |  Branch (1563:21): [True: 46.8M, False: 10.4M]
  ------------------
 1564|  46.8M|      mv->ref_frame = NONE_FRAME;
 1565|  46.8M|      mv++;
 1566|  46.8M|    }
 1567|  10.4M|    frame_mvs += frame_mvs_stride;
 1568|  10.4M|  }
 1569|  4.75M|}
decodemv.c:read_inter_frame_mode_info:
 1513|  6.36M|                                       DecoderCodingBlock *dcb, aom_reader *r) {
 1514|  6.36M|  AV1_COMMON *const cm = &pbi->common;
 1515|  6.36M|  MACROBLOCKD *const xd = &dcb->xd;
 1516|  6.36M|  MB_MODE_INFO *const mbmi = xd->mi[0];
 1517|  6.36M|  int inter_block = 1;
 1518|       |
 1519|  6.36M|  mbmi->mv[0].as_int = 0;
 1520|  6.36M|  mbmi->mv[1].as_int = 0;
 1521|  6.36M|  mbmi->segment_id = read_inter_segment_id(cm, xd, 1, r);
 1522|       |
 1523|  6.36M|  mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r);
 1524|       |
 1525|  6.36M|  if (mbmi->skip_mode)
  ------------------
  |  Branch (1525:7): [True: 108k, False: 6.25M]
  ------------------
 1526|   108k|    mbmi->skip_txfm = 1;
 1527|  6.25M|  else
 1528|  6.25M|    mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r);
 1529|       |
 1530|  6.36M|  if (!cm->seg.segid_preskip)
  ------------------
  |  Branch (1530:7): [True: 5.70M, False: 654k]
  ------------------
 1531|  5.70M|    mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r);
 1532|       |
 1533|  6.36M|  read_cdef(cm, r, xd);
 1534|       |
 1535|  6.36M|  read_delta_q_params(cm, xd, r);
 1536|       |
 1537|  6.36M|  if (!mbmi->skip_mode)
  ------------------
  |  Branch (1537:7): [True: 6.24M, False: 110k]
  ------------------
 1538|  6.24M|    inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
 1539|       |
 1540|  6.36M|  mbmi->current_qindex = xd->current_base_qindex;
 1541|       |
 1542|  6.36M|  xd->above_txfm_context =
 1543|  6.36M|      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
 1544|  6.36M|  xd->left_txfm_context =
 1545|  6.36M|      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
  ------------------
  |  |   50|  6.36M|#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
  |  |  ------------------
  |  |  |  |   44|  6.36M|#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
  |  |  |  |  ------------------
  |  |  |  |  |  |   43|  6.36M|#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   31|  6.36M|#define MAX_SB_SIZE_LOG2 7
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |               #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   39|  6.36M|#define MI_SIZE_LOG2 2
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
 1546|       |
 1547|  6.36M|  if (inter_block)
  ------------------
  |  Branch (1547:7): [True: 4.55M, False: 1.80M]
  ------------------
 1548|  4.55M|    read_inter_block_mode_info(pbi, dcb, mbmi, r);
 1549|  1.80M|  else
 1550|  1.80M|    read_intra_block_mode_info(cm, xd, mbmi, r);
 1551|  6.36M|}
decodemv.c:read_inter_segment_id:
  364|  12.0M|                                 int preskip, aom_reader *r) {
  365|  12.0M|  struct segmentation *const seg = &cm->seg;
  366|  12.0M|  const CommonModeInfoParams *const mi_params = &cm->mi_params;
  367|  12.0M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  368|  12.0M|  const int mi_row = xd->mi_row;
  369|  12.0M|  const int mi_col = xd->mi_col;
  370|  12.0M|  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
  371|  12.0M|  const int bw = mi_size_wide[mbmi->bsize];
  372|  12.0M|  const int bh = mi_size_high[mbmi->bsize];
  373|       |
  374|       |  // TODO(slavarnway): move x_mis, y_mis into xd ?????
  375|  12.0M|  const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
  ------------------
  |  |   34|  12.0M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 39.9k, False: 12.0M]
  |  |  ------------------
  ------------------
  376|  12.0M|  const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh);
  ------------------
  |  |   34|  12.0M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 196k, False: 11.8M]
  |  |  ------------------
  ------------------
  377|       |
  378|  12.0M|  if (!seg->enabled) return 0;  // Default for disabled segmentation
  ------------------
  |  Branch (378:7): [True: 11.0M, False: 1.04M]
  ------------------
  379|       |
  380|  1.04M|  if (!seg->update_map) {
  ------------------
  |  Branch (380:7): [True: 728k, False: 319k]
  ------------------
  381|   728k|    copy_segment_id(mi_params, cm->last_frame_seg_map, cm->cur_frame->seg_map,
  382|   728k|                    mi_offset, x_mis, y_mis);
  383|   728k|    return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
  384|   728k|  }
  385|       |
  386|   319k|  uint8_t segment_id;
  387|   319k|  const int mi_stride = cm->mi_params.mi_cols;
  388|   319k|  if (preskip) {
  ------------------
  |  Branch (388:7): [True: 220k, False: 99.0k]
  ------------------
  389|   220k|    if (!seg->segid_preskip) return 0;
  ------------------
  |  Branch (389:9): [True: 100k, False: 119k]
  ------------------
  390|   220k|  } else {
  391|  99.0k|    if (mbmi->skip_txfm) {
  ------------------
  |  Branch (391:9): [True: 38.9k, False: 60.0k]
  ------------------
  392|  38.9k|      if (seg->temporal_update) {
  ------------------
  |  Branch (392:11): [True: 403, False: 38.5k]
  ------------------
  393|    403|        mbmi->seg_id_predicted = 0;
  394|    403|      }
  395|  38.9k|      segment_id = read_segment_id(cm, xd, r, 1);
  396|  38.9k|      set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
  397|  38.9k|                     segment_id);
  398|  38.9k|      return segment_id;
  399|  38.9k|    }
  400|  99.0k|  }
  401|       |
  402|   179k|  if (seg->temporal_update) {
  ------------------
  |  Branch (402:7): [True: 37.8k, False: 142k]
  ------------------
  403|  37.8k|    const uint8_t ctx = av1_get_pred_context_seg_id(xd);
  404|  37.8k|    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  405|  37.8k|    struct segmentation_probs *const segp = &ec_ctx->seg;
  406|  37.8k|    aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx];
  407|  37.8k|    mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR);
  ------------------
  |  |   51|  37.8k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  408|  37.8k|    if (mbmi->seg_id_predicted) {
  ------------------
  |  Branch (408:9): [True: 22.7k, False: 15.1k]
  ------------------
  409|  22.7k|      segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
  410|  22.7k|    } else {
  411|  15.1k|      segment_id = read_segment_id(cm, xd, r, 0);
  412|  15.1k|    }
  413|   142k|  } else {
  414|   142k|    segment_id = read_segment_id(cm, xd, r, 0);
  415|   142k|  }
  416|   179k|  set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
  417|   179k|                 segment_id);
  418|   179k|  return segment_id;
  419|   319k|}
decodemv.c:copy_segment_id:
  339|   728k|                            int x_mis, int y_mis) {
  340|   728k|  const int stride = mi_params->mi_cols;
  341|   728k|  if (last_segment_ids) {
  ------------------
  |  Branch (341:7): [True: 263k, False: 465k]
  ------------------
  342|   263k|    assert(last_segment_ids != current_segment_ids);
  343|  1.76M|    for (int y = 0; y < y_mis; y++) {
  ------------------
  |  Branch (343:21): [True: 1.49M, False: 263k]
  ------------------
  344|  1.49M|      memcpy(&current_segment_ids[mi_offset + y * stride],
  345|  1.49M|             &last_segment_ids[mi_offset + y * stride],
  346|  1.49M|             sizeof(current_segment_ids[0]) * x_mis);
  347|  1.49M|    }
  348|   465k|  } else {
  349|  2.09M|    for (int y = 0; y < y_mis; y++) {
  ------------------
  |  Branch (349:21): [True: 1.63M, False: 465k]
  ------------------
  350|  1.63M|      memset(&current_segment_ids[mi_offset + y * stride], 0,
  351|  1.63M|             sizeof(current_segment_ids[0]) * x_mis);
  352|  1.63M|    }
  353|   465k|  }
  354|   728k|}
decodemv.c:get_predicted_segment_id:
  357|   752k|                                    int x_mis, int y_mis) {
  358|   752k|  return cm->last_frame_seg_map ? dec_get_segment_id(cm, cm->last_frame_seg_map,
  ------------------
  |  Branch (358:10): [True: 284k, False: 467k]
  ------------------
  359|   284k|                                                     mi_offset, x_mis, y_mis)
  360|   752k|                                : 0;
  361|   752k|}
decodemv.c:dec_get_segment_id:
  302|   284k|                              int mi_offset, int x_mis, int y_mis) {
  303|   284k|  int segment_id = INT_MAX;
  304|       |
  305|  1.82M|  for (int y = 0; y < y_mis; y++)
  ------------------
  |  Branch (305:19): [True: 1.54M, False: 284k]
  ------------------
  306|  21.8M|    for (int x = 0; x < x_mis; x++)
  ------------------
  |  Branch (306:21): [True: 20.3M, False: 1.54M]
  ------------------
  307|  20.3M|      segment_id = AOMMIN(
  ------------------
  |  |   34|  20.3M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 3.00M, False: 17.3M]
  |  |  ------------------
  ------------------
  308|   284k|          segment_id, segment_ids[mi_offset + y * cm->mi_params.mi_cols + x]);
  309|       |
  310|   284k|  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
  311|   284k|  return segment_id;
  312|   284k|}
decodemv.c:read_skip_mode:
  422|  6.36M|                          aom_reader *r) {
  423|  6.36M|  if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0;
  ------------------
  |  Branch (423:7): [True: 6.01M, False: 346k]
  ------------------
  424|       |
  425|   346k|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
  ------------------
  |  Branch (425:7): [True: 37.9k, False: 308k]
  ------------------
  426|  37.9k|    return 0;
  427|  37.9k|  }
  428|       |
  429|   308k|  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return 0;
  ------------------
  |  Branch (429:7): [True: 74.7k, False: 233k]
  ------------------
  430|       |
  431|   233k|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
  ------------------
  |  Branch (431:7): [True: 1.49k, False: 232k]
  ------------------
  432|   233k|      segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
  ------------------
  |  Branch (432:7): [True: 841, False: 231k]
  ------------------
  433|       |    // These features imply single-reference mode, while skip mode implies
  434|       |    // compound reference. Hence, the two are mutually exclusive.
  435|       |    // In other words, skip_mode is implicitly 0 here.
  436|  1.44k|    return 0;
  437|  1.44k|  }
  438|       |
  439|   232k|  const int ctx = av1_get_skip_mode_context(xd);
  440|   232k|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
  441|   232k|  const int skip_mode =
  442|   232k|      aom_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_STR);
  ------------------
  |  |   51|   232k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  443|   232k|  return skip_mode;
  444|   233k|}
decodemv.c:read_is_inter_block:
 1224|  6.25M|                               int segment_id, aom_reader *r) {
 1225|  6.25M|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
  ------------------
  |  Branch (1225:7): [True: 122k, False: 6.12M]
  ------------------
 1226|   122k|    const int frame = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
 1227|   122k|    if (frame < LAST_FRAME) return 0;
  ------------------
  |  Branch (1227:9): [True: 19.2k, False: 103k]
  ------------------
 1228|   103k|    return frame != INTRA_FRAME;
 1229|   122k|  }
 1230|  6.12M|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
  ------------------
  |  Branch (1230:7): [True: 421k, False: 5.70M]
  ------------------
 1231|   421k|    return 1;
 1232|   421k|  }
 1233|  5.70M|  const int ctx = av1_get_intra_inter_context(xd);
 1234|  5.70M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1235|  5.70M|  const int is_inter =
 1236|  5.70M|      aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR);
  ------------------
  |  |   51|  5.70M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1237|  5.70M|  return is_inter;
 1238|  6.12M|}
decodemv.c:read_inter_block_mode_info:
 1276|  4.55M|                                       aom_reader *r) {
 1277|  4.55M|  AV1_COMMON *const cm = &pbi->common;
 1278|  4.55M|  FeatureFlags *const features = &cm->features;
 1279|  4.55M|  const BLOCK_SIZE bsize = mbmi->bsize;
 1280|  4.55M|  const int allow_hp = features->allow_high_precision_mv;
 1281|  4.55M|  int_mv nearestmv[2], nearmv[2];
 1282|  4.55M|  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
 1283|  4.55M|  int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
 1284|  4.55M|  int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
 1285|  4.55M|  MACROBLOCKD *const xd = &dcb->xd;
 1286|  4.55M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1287|       |
 1288|  4.55M|  mbmi->uv_mode = UV_DC_PRED;
 1289|  4.55M|  mbmi->palette_mode_info.palette_size[0] = 0;
 1290|  4.55M|  mbmi->palette_mode_info.palette_size[1] = 0;
 1291|       |
 1292|  4.55M|  av1_collect_neighbors_ref_counts(xd);
 1293|       |
 1294|  4.55M|  read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
 1295|  4.55M|  const int is_compound = has_second_ref(mbmi);
 1296|       |
 1297|  4.55M|  const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
 1298|  4.55M|  av1_find_mv_refs(cm, xd, mbmi, ref_frame, dcb->ref_mv_count, xd->ref_mv_stack,
 1299|  4.55M|                   xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx);
 1300|       |
 1301|  4.55M|  mbmi->ref_mv_idx = 0;
 1302|       |
 1303|  4.55M|  if (mbmi->skip_mode) {
  ------------------
  |  Branch (1303:7): [True: 108k, False: 4.44M]
  ------------------
 1304|   108k|    assert(is_compound);
 1305|   108k|    mbmi->mode = NEAREST_NEARESTMV;
 1306|  4.44M|  } else {
 1307|  4.44M|    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
  ------------------
  |  Branch (1307:9): [True: 575k, False: 3.86M]
  ------------------
 1308|  4.44M|        segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) {
  ------------------
  |  Branch (1308:9): [True: 10.2k, False: 3.85M]
  ------------------
 1309|   585k|      mbmi->mode = GLOBALMV;
 1310|  3.85M|    } else {
 1311|  3.85M|      const int mode_ctx =
 1312|  3.85M|          av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
 1313|  3.85M|      if (is_compound)
  ------------------
  |  Branch (1313:11): [True: 524k, False: 3.33M]
  ------------------
 1314|   524k|        mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx);
 1315|  3.33M|      else
 1316|  3.33M|        mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx);
 1317|  3.85M|      if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
  ------------------
  |  Branch (1317:11): [True: 1.92M, False: 1.93M]
  |  Branch (1317:34): [True: 86.0k, False: 1.84M]
  ------------------
 1318|  3.85M|          have_nearmv_in_inter_mode(mbmi->mode))
  ------------------
  |  Branch (1318:11): [True: 460k, False: 1.38M]
  ------------------
 1319|  2.47M|        read_drl_idx(ec_ctx, dcb, mbmi, r);
 1320|  3.85M|    }
 1321|  4.44M|  }
 1322|       |
 1323|  4.55M|  if (is_compound != is_inter_compound_mode(mbmi->mode)) {
  ------------------
  |  Branch (1323:7): [True: 0, False: 4.55M]
  ------------------
 1324|      0|    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
 1325|      0|                       "Prediction mode %d invalid with ref frame %d %d",
 1326|      0|                       mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
 1327|      0|  }
 1328|       |
 1329|  4.55M|  if (!is_compound && mbmi->mode != GLOBALMV) {
  ------------------
  |  Branch (1329:7): [True: 3.92M, False: 632k]
  |  Branch (1329:23): [True: 3.12M, False: 792k]
  ------------------
 1330|  3.12M|    av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0],
 1331|  3.12M|                          &nearmv[0], features->cur_frame_force_integer_mv);
 1332|  3.12M|  }
 1333|       |
 1334|  4.55M|  if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) {
  ------------------
  |  Branch (1334:7): [True: 633k, False: 3.91M]
  |  Branch (1334:22): [True: 582k, False: 50.9k]
  ------------------
 1335|   582k|    const int ref_mv_idx = mbmi->ref_mv_idx + 1;
 1336|   582k|    nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv;
 1337|   582k|    nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv;
 1338|   582k|    nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
 1339|   582k|    nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
 1340|   582k|    lower_mv_precision(&nearestmv[0].as_mv, allow_hp,
 1341|   582k|                       features->cur_frame_force_integer_mv);
 1342|   582k|    lower_mv_precision(&nearestmv[1].as_mv, allow_hp,
 1343|   582k|                       features->cur_frame_force_integer_mv);
 1344|   582k|    lower_mv_precision(&nearmv[0].as_mv, allow_hp,
 1345|   582k|                       features->cur_frame_force_integer_mv);
 1346|   582k|    lower_mv_precision(&nearmv[1].as_mv, allow_hp,
 1347|   582k|                       features->cur_frame_force_integer_mv);
 1348|  3.96M|  } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) {
  ------------------
  |  Branch (1348:14): [True: 750k, False: 3.21M]
  |  Branch (1348:38): [True: 55.2k, False: 694k]
  ------------------
 1349|  55.2k|    nearmv[0] =
 1350|  55.2k|        xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
 1351|  55.2k|  }
 1352|       |
 1353|  4.55M|  int_mv ref_mv[2] = { nearestmv[0], nearestmv[1] };
 1354|       |
 1355|  4.55M|  if (is_compound) {
  ------------------
  |  Branch (1355:7): [True: 634k, False: 3.91M]
  ------------------
 1356|   634k|    int ref_mv_idx = mbmi->ref_mv_idx;
 1357|       |    // Special case: NEAR_NEWMV and NEW_NEARMV modes use
 1358|       |    // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
 1359|       |    // mbmi->ref_mv_idx (like NEWMV)
 1360|   634k|    if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
  ------------------
  |  Branch (1360:9): [True: 32.5k, False: 601k]
  |  Branch (1360:37): [True: 12.7k, False: 589k]
  ------------------
 1361|  45.3k|      ref_mv_idx = 1 + mbmi->ref_mv_idx;
 1362|       |
 1363|       |    // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here?
 1364|   634k|    if (compound_ref0_mode(mbmi->mode) == NEWMV)
  ------------------
  |  Branch (1364:9): [True: 122k, False: 511k]
  ------------------
 1365|   122k|      ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
 1366|       |
 1367|   634k|    if (compound_ref1_mode(mbmi->mode) == NEWMV)
  ------------------
  |  Branch (1367:9): [True: 146k, False: 487k]
  ------------------
 1368|   146k|      ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
 1369|  3.91M|  } else {
 1370|  3.91M|    if (mbmi->mode == NEWMV) {
  ------------------
  |  Branch (1370:9): [True: 1.92M, False: 1.99M]
  ------------------
 1371|  1.92M|      if (dcb->ref_mv_count[ref_frame] > 1)
  ------------------
  |  Branch (1371:11): [True: 1.63M, False: 290k]
  ------------------
 1372|  1.63M|        ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv;
 1373|  1.92M|    }
 1374|  3.91M|  }
 1375|       |
 1376|  4.55M|  if (mbmi->skip_mode) assert(mbmi->mode == NEAREST_NEARESTMV);
  ------------------
  |  Branch (1376:7): [True: 108k, False: 4.44M]
  ------------------
 1377|       |
 1378|  4.55M|  const int mv_corrupted_flag =
 1379|  4.55M|      !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv,
 1380|  4.55M|                 nearestmv, nearmv, is_compound, allow_hp, r);
 1381|  4.55M|  aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag);
 1382|       |
 1383|  4.55M|  mbmi->use_wedge_interintra = 0;
 1384|  4.55M|  if (cm->seq_params->enable_interintra_compound && !mbmi->skip_mode &&
  ------------------
  |  Branch (1384:7): [True: 4.31M, False: 241k]
  |  Branch (1384:53): [True: 4.20M, False: 104k]
  ------------------
 1385|  4.55M|      is_interintra_allowed(mbmi)) {
  ------------------
  |  Branch (1385:7): [True: 1.85M, False: 2.35M]
  ------------------
 1386|  1.85M|    const int bsize_group = size_group_lookup[bsize];
 1387|  1.85M|    const int interintra =
 1388|  1.85M|        aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR);
  ------------------
  |  |   51|  1.85M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1389|  1.85M|    assert(mbmi->ref_frame[1] == NONE_FRAME);
 1390|  1.85M|    if (interintra) {
  ------------------
  |  Branch (1390:9): [True: 339k, False: 1.51M]
  ------------------
 1391|   339k|      const INTERINTRA_MODE interintra_mode =
 1392|   339k|          read_interintra_mode(xd, r, bsize_group);
 1393|   339k|      mbmi->ref_frame[1] = INTRA_FRAME;
 1394|   339k|      mbmi->interintra_mode = interintra_mode;
 1395|   339k|      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
 1396|   339k|      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
 1397|   339k|      mbmi->filter_intra_mode_info.use_filter_intra = 0;
 1398|   339k|      if (av1_is_wedge_used(bsize)) {
  ------------------
  |  Branch (1398:11): [True: 339k, False: 0]
  ------------------
 1399|   339k|        mbmi->use_wedge_interintra = aom_read_symbol(
  ------------------
  |  |   51|   339k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1400|   339k|            r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR);
 1401|   339k|        if (mbmi->use_wedge_interintra) {
  ------------------
  |  Branch (1401:13): [True: 97.7k, False: 242k]
  ------------------
 1402|  97.7k|          mbmi->interintra_wedge_index = (int8_t)aom_read_symbol(
  ------------------
  |  |   51|  97.7k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1403|  97.7k|              r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR);
 1404|  97.7k|        }
 1405|   339k|      }
 1406|   339k|    }
 1407|  1.85M|  }
 1408|       |
 1409|  9.73M|  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
  ------------------
  |  Branch (1409:21): [True: 5.18M, False: 4.55M]
  ------------------
 1410|  5.18M|    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
 1411|  5.18M|    xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame);
 1412|  5.18M|  }
 1413|       |
 1414|  4.55M|  mbmi->motion_mode = SIMPLE_TRANSLATION;
 1415|  4.55M|  if (is_motion_variation_allowed_bsize(mbmi->bsize) && !mbmi->skip_mode &&
  ------------------
  |  Branch (1415:7): [True: 3.02M, False: 1.52M]
  |  Branch (1415:57): [True: 2.91M, False: 108k]
  ------------------
 1416|  4.55M|      !has_second_ref(mbmi)) {
  ------------------
  |  Branch (1416:7): [True: 2.39M, False: 524k]
  ------------------
 1417|  2.39M|    mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
 1418|  2.39M|  }
 1419|  4.55M|  av1_count_overlappable_neighbors(cm, xd);
 1420|       |
 1421|  4.55M|  if (mbmi->ref_frame[1] != INTRA_FRAME)
  ------------------
  |  Branch (1421:7): [True: 4.21M, False: 338k]
  ------------------
 1422|  4.21M|    mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r);
 1423|       |
 1424|       |  // init
 1425|  4.55M|  mbmi->comp_group_idx = 0;
 1426|  4.55M|  mbmi->compound_idx = 1;
 1427|  4.55M|  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
 1428|       |
 1429|  4.55M|  if (has_second_ref(mbmi) && !mbmi->skip_mode) {
  ------------------
  |  Branch (1429:7): [True: 633k, False: 3.91M]
  |  Branch (1429:31): [True: 524k, False: 108k]
  ------------------
 1430|       |    // Read idx to indicate current compound inter prediction mode group
 1431|   524k|    const int masked_compound_used = is_any_masked_compound_used(bsize) &&
  ------------------
  |  Branch (1431:38): [True: 523k, False: 1.29k]
  ------------------
 1432|   524k|                                     cm->seq_params->enable_masked_compound;
  ------------------
  |  Branch (1432:38): [True: 500k, False: 22.7k]
  ------------------
 1433|       |
 1434|   524k|    if (masked_compound_used) {
  ------------------
  |  Branch (1434:9): [True: 500k, False: 24.0k]
  ------------------
 1435|   500k|      const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
 1436|   500k|      mbmi->comp_group_idx = (uint8_t)aom_read_symbol(
  ------------------
  |  |   51|   500k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1437|   500k|          r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR);
 1438|   500k|    }
 1439|       |
 1440|   524k|    if (mbmi->comp_group_idx == 0) {
  ------------------
  |  Branch (1440:9): [True: 375k, False: 148k]
  ------------------
 1441|   375k|      if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
  ------------------
  |  Branch (1441:11): [True: 337k, False: 38.8k]
  ------------------
 1442|   337k|        const int comp_index_ctx = get_comp_index_context(cm, xd);
 1443|   337k|        mbmi->compound_idx = (uint8_t)aom_read_symbol(
  ------------------
  |  |   51|   337k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1444|   337k|            r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
 1445|   337k|        mbmi->interinter_comp.type =
 1446|   337k|            mbmi->compound_idx ? COMPOUND_AVERAGE : COMPOUND_DISTWTD;
  ------------------
  |  Branch (1446:13): [True: 210k, False: 126k]
  ------------------
 1447|   337k|      } else {
 1448|       |        // Distance-weighted compound is disabled, so always use average
 1449|  38.8k|        mbmi->compound_idx = 1;
 1450|  38.8k|        mbmi->interinter_comp.type = COMPOUND_AVERAGE;
 1451|  38.8k|      }
 1452|   375k|    } else {
 1453|   148k|      assert(cm->current_frame.reference_mode != SINGLE_REFERENCE &&
 1454|   148k|             is_inter_compound_mode(mbmi->mode) &&
 1455|   148k|             mbmi->motion_mode == SIMPLE_TRANSLATION);
 1456|   148k|      assert(masked_compound_used);
 1457|       |
 1458|       |      // compound_diffwtd, wedge
 1459|   148k|      if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
  ------------------
  |  Branch (1459:11): [True: 128k, False: 19.2k]
  ------------------
 1460|   128k|        mbmi->interinter_comp.type =
 1461|   128k|            COMPOUND_WEDGE + aom_read_symbol(r,
  ------------------
  |  |   51|   128k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1462|   128k|                                             ec_ctx->compound_type_cdf[bsize],
 1463|   128k|                                             MASKED_COMPOUND_TYPES, ACCT_STR);
 1464|   128k|      } else {
 1465|  19.2k|        mbmi->interinter_comp.type = COMPOUND_DIFFWTD;
 1466|  19.2k|      }
 1467|       |
 1468|   148k|      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
  ------------------
  |  Branch (1468:11): [True: 59.9k, False: 88.1k]
  ------------------
 1469|  59.9k|        assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
 1470|  59.9k|        mbmi->interinter_comp.wedge_index = (int8_t)aom_read_symbol(
  ------------------
  |  |   51|  59.9k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1471|  59.9k|            r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR);
 1472|  59.9k|        mbmi->interinter_comp.wedge_sign = (int8_t)aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  59.9k|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1473|  88.1k|      } else {
 1474|  88.1k|        assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
 1475|  88.2k|        mbmi->interinter_comp.mask_type =
 1476|  88.2k|            aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR);
  ------------------
  |  |   47|  88.2k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1477|  88.2k|      }
 1478|   148k|    }
 1479|   524k|  }
 1480|       |
 1481|  4.55M|  read_mb_interp_filter(xd, features->interp_filter,
 1482|  4.55M|                        cm->seq_params->enable_dual_filter, mbmi, r);
 1483|       |
 1484|  4.55M|  if (mbmi->motion_mode == WARPED_CAUSAL) {
  ------------------
  |  Branch (1484:7): [True: 305k, False: 4.24M]
  ------------------
 1485|   305k|    const int mi_row = xd->mi_row;
 1486|   305k|    const int mi_col = xd->mi_col;
 1487|   305k|    mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
  ------------------
  |  |   32|   305k|#define DEFAULT_WMTYPE AFFINE
  ------------------
 1488|   305k|    mbmi->wm_params.invalid = 0;
 1489|       |
 1490|   305k|    if (mbmi->num_proj_ref > 1) {
  ------------------
  |  Branch (1490:9): [True: 239k, False: 66.5k]
  ------------------
 1491|   239k|      mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
 1492|   239k|                                             mbmi->num_proj_ref, bsize);
 1493|   239k|    }
 1494|       |
 1495|   305k|    if (av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
  ------------------
  |  Branch (1495:9): [True: 48.1k, False: 257k]
  ------------------
 1496|   305k|                            mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
 1497|   305k|                            &mbmi->wm_params, mi_row, mi_col)) {
 1498|       |#if WARPED_MOTION_DEBUG
 1499|       |      printf("Warning: unexpected warped model from aomenc\n");
 1500|       |#endif
 1501|  48.1k|      mbmi->wm_params.invalid = 1;
 1502|  48.1k|    }
 1503|   305k|  }
 1504|       |
 1505|  4.55M|  xd->cfl.store_y = store_cfl_required(cm, xd);
 1506|       |
 1507|       |#if DEC_MISMATCH_DEBUG
 1508|       |  dec_dump_logs(cm, mi, mi_row, mi_col, mode_ctx);
 1509|       |#endif  // DEC_MISMATCH_DEBUG
 1510|  4.55M|}
decodemv.c:read_ref_frames:
  940|  4.55M|                            MV_REFERENCE_FRAME ref_frame[2]) {
  941|  4.55M|  if (xd->mi[0]->skip_mode) {
  ------------------
  |  Branch (941:7): [True: 108k, False: 4.44M]
  ------------------
  942|   108k|    set_ref_frames_for_skip_mode(cm, ref_frame);
  943|   108k|    return;
  944|   108k|  }
  945|       |
  946|  4.44M|  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
  ------------------
  |  Branch (946:7): [True: 103k, False: 4.34M]
  ------------------
  947|   103k|    ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
  948|   103k|                                                   SEG_LVL_REF_FRAME);
  949|   103k|    ref_frame[1] = NONE_FRAME;
  950|  4.34M|  } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
  ------------------
  |  Branch (950:14): [True: 486k, False: 3.85M]
  ------------------
  951|  4.34M|             segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
  ------------------
  |  Branch (951:14): [True: 7.84k, False: 3.84M]
  ------------------
  952|   491k|    ref_frame[0] = LAST_FRAME;
  953|   491k|    ref_frame[1] = NONE_FRAME;
  954|  3.85M|  } else {
  955|  3.85M|    const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
  956|       |
  957|  3.85M|    if (mode == COMPOUND_REFERENCE) {
  ------------------
  |  Branch (957:9): [True: 524k, False: 3.32M]
  ------------------
  958|   524k|      const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r);
  959|       |
  960|   524k|      if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
  ------------------
  |  Branch (960:11): [True: 87.2k, False: 437k]
  ------------------
  961|  87.2k|        const int bit = READ_REF_BIT(uni_comp_ref_p);
  ------------------
  |  |  920|  87.2k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  87.2k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  962|  87.2k|        if (bit) {
  ------------------
  |  Branch (962:13): [True: 21.3k, False: 65.9k]
  ------------------
  963|  21.3k|          ref_frame[0] = BWDREF_FRAME;
  964|  21.3k|          ref_frame[1] = ALTREF_FRAME;
  965|  65.9k|        } else {
  966|  65.9k|          const int bit1 = READ_REF_BIT(uni_comp_ref_p1);
  ------------------
  |  |  920|  65.9k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  65.9k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  967|  65.9k|          if (bit1) {
  ------------------
  |  Branch (967:15): [True: 39.6k, False: 26.2k]
  ------------------
  968|  39.6k|            const int bit2 = READ_REF_BIT(uni_comp_ref_p2);
  ------------------
  |  |  920|  39.6k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  39.6k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  969|  39.6k|            if (bit2) {
  ------------------
  |  Branch (969:17): [True: 23.5k, False: 16.1k]
  ------------------
  970|  23.5k|              ref_frame[0] = LAST_FRAME;
  971|  23.5k|              ref_frame[1] = GOLDEN_FRAME;
  972|  23.5k|            } else {
  973|  16.1k|              ref_frame[0] = LAST_FRAME;
  974|  16.1k|              ref_frame[1] = LAST3_FRAME;
  975|  16.1k|            }
  976|  39.6k|          } else {
  977|  26.2k|            ref_frame[0] = LAST_FRAME;
  978|  26.2k|            ref_frame[1] = LAST2_FRAME;
  979|  26.2k|          }
  980|  65.9k|        }
  981|       |
  982|  87.2k|        return;
  983|  87.2k|      }
  984|       |
  985|   437k|      assert(comp_ref_type == BIDIR_COMP_REFERENCE);
  986|       |
  987|   437k|      const int idx = 1;
  988|   437k|      const int bit = READ_REF_BIT(comp_ref_p);
  ------------------
  |  |  920|   437k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|   437k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  989|       |      // Decode forward references.
  990|   437k|      if (!bit) {
  ------------------
  |  Branch (990:11): [True: 335k, False: 101k]
  ------------------
  991|   335k|        const int bit1 = READ_REF_BIT(comp_ref_p1);
  ------------------
  |  |  920|   335k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|   335k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  992|   335k|        ref_frame[!idx] = bit1 ? LAST2_FRAME : LAST_FRAME;
  ------------------
  |  Branch (992:27): [True: 21.7k, False: 313k]
  ------------------
  993|   335k|      } else {
  994|   101k|        const int bit2 = READ_REF_BIT(comp_ref_p2);
  ------------------
  |  |  920|   101k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|   101k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
  995|   101k|        ref_frame[!idx] = bit2 ? GOLDEN_FRAME : LAST3_FRAME;
  ------------------
  |  Branch (995:27): [True: 67.6k, False: 34.0k]
  ------------------
  996|   101k|      }
  997|       |
  998|       |      // Decode backward references.
  999|   437k|      const int bit_bwd = READ_REF_BIT(comp_bwdref_p);
  ------------------
  |  |  920|   437k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|   437k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1000|   437k|      if (!bit_bwd) {
  ------------------
  |  Branch (1000:11): [True: 183k, False: 253k]
  ------------------
 1001|   183k|        const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1);
  ------------------
  |  |  920|   183k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|   183k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1002|   183k|        ref_frame[idx] = bit1_bwd ? ALTREF2_FRAME : BWDREF_FRAME;
  ------------------
  |  Branch (1002:26): [True: 100k, False: 83.0k]
  ------------------
 1003|   253k|      } else {
 1004|   253k|        ref_frame[idx] = ALTREF_FRAME;
 1005|   253k|      }
 1006|  3.32M|    } else if (mode == SINGLE_REFERENCE) {
  ------------------
  |  Branch (1006:16): [True: 3.32M, False: 1.22k]
  ------------------
 1007|  3.32M|      const int bit0 = READ_REF_BIT(single_ref_p1);
  ------------------
  |  |  920|  3.32M|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  3.32M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1008|  3.32M|      if (bit0) {
  ------------------
  |  Branch (1008:11): [True: 472k, False: 2.85M]
  ------------------
 1009|   472k|        const int bit1 = READ_REF_BIT(single_ref_p2);
  ------------------
  |  |  920|   472k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|   472k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1010|   472k|        if (!bit1) {
  ------------------
  |  Branch (1010:13): [True: 215k, False: 257k]
  ------------------
 1011|   215k|          const int bit5 = READ_REF_BIT(single_ref_p6);
  ------------------
  |  |  920|   215k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|   215k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1012|   215k|          ref_frame[0] = bit5 ? ALTREF2_FRAME : BWDREF_FRAME;
  ------------------
  |  Branch (1012:26): [True: 118k, False: 96.8k]
  ------------------
 1013|   257k|        } else {
 1014|   257k|          ref_frame[0] = ALTREF_FRAME;
 1015|   257k|        }
 1016|  2.85M|      } else {
 1017|  2.85M|        const int bit2 = READ_REF_BIT(single_ref_p3);
  ------------------
  |  |  920|  2.85M|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  2.85M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1018|  2.85M|        if (bit2) {
  ------------------
  |  Branch (1018:13): [True: 245k, False: 2.60M]
  ------------------
 1019|   245k|          const int bit4 = READ_REF_BIT(single_ref_p5);
  ------------------
  |  |  920|   245k|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|   245k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1020|   245k|          ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME;
  ------------------
  |  Branch (1020:26): [True: 168k, False: 76.6k]
  ------------------
 1021|  2.60M|        } else {
 1022|  2.60M|          const int bit3 = READ_REF_BIT(single_ref_p4);
  ------------------
  |  |  920|  2.60M|  aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
  |  |  ------------------
  |  |  |  |   51|  2.60M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  |  |  ------------------
  ------------------
 1023|  2.60M|          ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
  ------------------
  |  Branch (1023:26): [True: 132k, False: 2.47M]
  ------------------
 1024|  2.60M|        }
 1025|  2.85M|      }
 1026|       |
 1027|  3.32M|      ref_frame[1] = NONE_FRAME;
 1028|  3.32M|    } else {
 1029|  1.22k|      assert(0 && "Invalid prediction mode.");
 1030|  1.22k|    }
 1031|  3.85M|  }
 1032|  4.44M|}
decodemv.c:set_ref_frames_for_skip_mode:
  932|   108k|                                         MV_REFERENCE_FRAME ref_frame[2]) {
  933|   108k|  ref_frame[0] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_0;
  934|   108k|  ref_frame[1] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_1;
  935|   108k|}
decodemv.c:read_block_reference_mode:
  906|  3.84M|                                                aom_reader *r) {
  907|  3.84M|  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return SINGLE_REFERENCE;
  ------------------
  |  Branch (907:7): [True: 1.31M, False: 2.53M]
  ------------------
  908|  2.53M|  if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
  ------------------
  |  Branch (908:7): [True: 989k, False: 1.54M]
  ------------------
  909|   989k|    const int ctx = av1_get_reference_mode_context(xd);
  910|   989k|    const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol(
  ------------------
  |  |   51|   989k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  911|   989k|        r, xd->tile_ctx->comp_inter_cdf[ctx], 2, ACCT_STR);
  912|   989k|    return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
  913|  1.54M|  } else {
  914|  1.54M|    assert(cm->current_frame.reference_mode == SINGLE_REFERENCE);
  915|  1.55M|    return cm->current_frame.reference_mode;
  916|  1.54M|  }
  917|  2.53M|}
decodemv.c:read_comp_reference_type:
  923|   524k|                                                    aom_reader *r) {
  924|   524k|  const int ctx = av1_get_comp_reference_type_context(xd);
  925|   524k|  const COMP_REFERENCE_TYPE comp_ref_type =
  926|   524k|      (COMP_REFERENCE_TYPE)aom_read_symbol(
  ------------------
  |  |   51|   524k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  927|   524k|          r, xd->tile_ctx->comp_ref_type_cdf[ctx], 2, ACCT_STR);
  928|   524k|  return comp_ref_type;  // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE
  929|   524k|}
decodemv.c:read_inter_compound_mode:
  250|   524k|                                                int16_t ctx) {
  251|   524k|  const int mode =
  252|   524k|      aom_read_symbol(r, xd->tile_ctx->inter_compound_mode_cdf[ctx],
  ------------------
  |  |   51|   524k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  253|   524k|                      INTER_COMPOUND_MODES, ACCT_STR);
  254|   524k|  assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode));
  255|   524k|  return NEAREST_NEARESTMV + mode;
  256|   524k|}
decodemv.c:read_inter_mode:
  178|  3.33M|                                       int16_t ctx) {
  179|  3.33M|  int16_t mode_ctx = ctx & NEWMV_CTX_MASK;
  ------------------
  |  |  490|  3.33M|#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1)
  |  |  ------------------
  |  |  |  |  487|  3.33M|#define GLOBALMV_OFFSET 3
  |  |  ------------------
  ------------------
  180|  3.33M|  int is_newmv, is_zeromv, is_refmv;
  181|  3.33M|  is_newmv = aom_read_symbol(r, ec_ctx->newmv_cdf[mode_ctx], 2, ACCT_STR) == 0;
  ------------------
  |  |   51|  3.33M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  182|  3.33M|  if (is_newmv) return NEWMV;
  ------------------
  |  Branch (182:7): [True: 1.92M, False: 1.41M]
  ------------------
  183|       |
  184|  1.41M|  mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
  ------------------
  |  |  487|  1.41M|#define GLOBALMV_OFFSET 3
  ------------------
                mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
  ------------------
  |  |  491|  1.41M|#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1)
  |  |  ------------------
  |  |  |  |  488|  1.41M|#define REFMV_OFFSET 4
  |  |  ------------------
  |  |               #define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1)
  |  |  ------------------
  |  |  |  |  487|  1.41M|#define GLOBALMV_OFFSET 3
  |  |  ------------------
  ------------------
  185|  1.41M|  is_zeromv =
  186|  1.41M|      aom_read_symbol(r, ec_ctx->zeromv_cdf[mode_ctx], 2, ACCT_STR) == 0;
  ------------------
  |  |   51|  1.41M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  187|  1.41M|  if (is_zeromv) return GLOBALMV;
  ------------------
  |  Branch (187:7): [True: 207k, False: 1.20M]
  ------------------
  188|       |
  189|  1.20M|  mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
  ------------------
  |  |  488|  1.20M|#define REFMV_OFFSET 4
  ------------------
                mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
  ------------------
  |  |  492|  1.20M|#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
  |  |  ------------------
  |  |  |  |  488|  1.20M|#define REFMV_OFFSET 4
  |  |  ------------------
  ------------------
  190|  1.20M|  is_refmv = aom_read_symbol(r, ec_ctx->refmv_cdf[mode_ctx], 2, ACCT_STR) == 0;
  ------------------
  |  |   51|  1.20M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  191|  1.20M|  if (is_refmv)
  ------------------
  |  Branch (191:7): [True: 886k, False: 315k]
  ------------------
  192|   886k|    return NEARESTMV;
  193|   315k|  else
  194|   315k|    return NEARMV;
  195|  1.20M|}
decodemv.c:read_drl_idx:
  198|  2.47M|                         MB_MODE_INFO *mbmi, aom_reader *r) {
  199|  2.47M|  MACROBLOCKD *const xd = &dcb->xd;
  200|  2.47M|  uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
  201|  2.47M|  mbmi->ref_mv_idx = 0;
  202|  2.47M|  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
  ------------------
  |  Branch (202:7): [True: 1.92M, False: 548k]
  |  Branch (202:30): [True: 86.4k, False: 461k]
  ------------------
  203|  3.76M|    for (int idx = 0; idx < 2; ++idx) {
  ------------------
  |  Branch (203:23): [True: 3.03M, False: 730k]
  ------------------
  204|  3.03M|      if (dcb->ref_mv_count[ref_frame_type] > idx + 1) {
  ------------------
  |  Branch (204:11): [True: 2.25M, False: 777k]
  ------------------
  205|  2.25M|        uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
  206|  2.25M|        int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
  ------------------
  |  |   51|  2.25M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  207|  2.25M|        mbmi->ref_mv_idx = idx + drl_idx;
  208|  2.25M|        if (!drl_idx) return;
  ------------------
  |  Branch (208:13): [True: 1.28M, False: 977k]
  ------------------
  209|  2.25M|      }
  210|  3.03M|    }
  211|  2.01M|  }
  212|  1.19M|  if (have_nearmv_in_inter_mode(mbmi->mode)) {
  ------------------
  |  Branch (212:7): [True: 461k, False: 729k]
  ------------------
  213|       |    // Offset the NEARESTMV mode.
  214|       |    // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
  215|       |    // mode is factored in.
  216|  1.15M|    for (int idx = 1; idx < 3; ++idx) {
  ------------------
  |  Branch (216:23): [True: 819k, False: 337k]
  ------------------
  217|   819k|      if (dcb->ref_mv_count[ref_frame_type] > idx + 1) {
  ------------------
  |  Branch (217:11): [True: 210k, False: 608k]
  ------------------
  218|   210k|        uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx);
  219|   210k|        int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
  ------------------
  |  |   51|   210k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  220|   210k|        mbmi->ref_mv_idx = idx + drl_idx - 1;
  221|   210k|        if (!drl_idx) return;
  ------------------
  |  Branch (221:13): [True: 124k, False: 86.7k]
  ------------------
  222|   210k|      }
  223|   819k|    }
  224|   461k|  }
  225|  1.19M|}
decodemv.c:assign_mv:
 1119|  4.55M|                            aom_reader *r) {
 1120|  4.55M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1121|  4.55M|  MB_MODE_INFO *mbmi = xd->mi[0];
 1122|  4.55M|  BLOCK_SIZE bsize = mbmi->bsize;
 1123|  4.55M|  FeatureFlags *const features = &cm->features;
 1124|  4.55M|  if (features->cur_frame_force_integer_mv) {
  ------------------
  |  Branch (1124:7): [True: 215k, False: 4.33M]
  ------------------
 1125|   215k|    allow_hp = MV_SUBPEL_NONE;
 1126|   215k|  }
 1127|  4.55M|  switch (mode) {
 1128|  1.92M|    case NEWMV: {
  ------------------
  |  Branch (1128:5): [True: 1.92M, False: 2.62M]
  ------------------
 1129|  1.92M|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1130|  1.92M|      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
 1131|  1.92M|      break;
 1132|      0|    }
 1133|   886k|    case NEARESTMV: {
  ------------------
  |  Branch (1133:5): [True: 886k, False: 3.66M]
  ------------------
 1134|   886k|      mv[0].as_int = nearest_mv[0].as_int;
 1135|   886k|      break;
 1136|      0|    }
 1137|   317k|    case NEARMV: {
  ------------------
  |  Branch (1137:5): [True: 317k, False: 4.23M]
  ------------------
 1138|   317k|      mv[0].as_int = near_mv[0].as_int;
 1139|   317k|      break;
 1140|      0|    }
 1141|   793k|    case GLOBALMV: {
  ------------------
  |  Branch (1141:5): [True: 793k, False: 3.76M]
  ------------------
 1142|   793k|      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
 1143|   793k|                                          features->allow_high_precision_mv,
 1144|   793k|                                          bsize, xd->mi_col, xd->mi_row,
 1145|   793k|                                          features->cur_frame_force_integer_mv)
 1146|   793k|                         .as_int;
 1147|   793k|      break;
 1148|      0|    }
 1149|  86.5k|    case NEW_NEWMV: {
  ------------------
  |  Branch (1149:5): [True: 86.5k, False: 4.46M]
  ------------------
 1150|  86.5k|      assert(is_compound);
 1151|   259k|      for (int i = 0; i < 2; ++i) {
  ------------------
  |  Branch (1151:23): [True: 172k, False: 86.4k]
  ------------------
 1152|   172k|        nmv_context *const nmvc = &ec_ctx->nmvc;
 1153|   172k|        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, allow_hp);
 1154|   172k|      }
 1155|  86.4k|      break;
 1156|  86.5k|    }
 1157|   301k|    case NEAREST_NEARESTMV: {
  ------------------
  |  Branch (1157:5): [True: 301k, False: 4.25M]
  ------------------
 1158|   301k|      assert(is_compound);
 1159|   301k|      mv[0].as_int = nearest_mv[0].as_int;
 1160|   301k|      mv[1].as_int = nearest_mv[1].as_int;
 1161|   301k|      break;
 1162|   301k|    }
 1163|  99.2k|    case NEAR_NEARMV: {
  ------------------
  |  Branch (1163:5): [True: 99.2k, False: 4.45M]
  ------------------
 1164|  99.2k|      assert(is_compound);
 1165|  99.3k|      mv[0].as_int = near_mv[0].as_int;
 1166|  99.3k|      mv[1].as_int = near_mv[1].as_int;
 1167|  99.3k|      break;
 1168|  99.2k|    }
 1169|  23.4k|    case NEW_NEARESTMV: {
  ------------------
  |  Branch (1169:5): [True: 23.4k, False: 4.53M]
  ------------------
 1170|  23.4k|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1171|  23.4k|      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
 1172|  23.4k|      assert(is_compound);
 1173|  23.4k|      mv[1].as_int = nearest_mv[1].as_int;
 1174|  23.4k|      break;
 1175|  23.4k|    }
 1176|  27.5k|    case NEAREST_NEWMV: {
  ------------------
  |  Branch (1176:5): [True: 27.5k, False: 4.52M]
  ------------------
 1177|  27.5k|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1178|  27.5k|      mv[0].as_int = nearest_mv[0].as_int;
 1179|  27.5k|      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
 1180|  27.5k|      assert(is_compound);
 1181|  27.5k|      break;
 1182|  27.5k|    }
 1183|  32.5k|    case NEAR_NEWMV: {
  ------------------
  |  Branch (1183:5): [True: 32.5k, False: 4.52M]
  ------------------
 1184|  32.5k|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1185|  32.5k|      mv[0].as_int = near_mv[0].as_int;
 1186|  32.5k|      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
 1187|  32.5k|      assert(is_compound);
 1188|  32.5k|      break;
 1189|  32.5k|    }
 1190|  32.5k|    case NEW_NEARMV: {
  ------------------
  |  Branch (1190:5): [True: 12.7k, False: 4.54M]
  ------------------
 1191|  12.7k|      nmv_context *const nmvc = &ec_ctx->nmvc;
 1192|  12.7k|      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
 1193|  12.7k|      assert(is_compound);
 1194|  12.7k|      mv[1].as_int = near_mv[1].as_int;
 1195|  12.7k|      break;
 1196|  12.7k|    }
 1197|  51.1k|    case GLOBAL_GLOBALMV: {
  ------------------
  |  Branch (1197:5): [True: 51.1k, False: 4.50M]
  ------------------
 1198|  51.1k|      assert(is_compound);
 1199|  51.1k|      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
 1200|  51.1k|                                          features->allow_high_precision_mv,
 1201|  51.1k|                                          bsize, xd->mi_col, xd->mi_row,
 1202|  51.1k|                                          features->cur_frame_force_integer_mv)
 1203|  51.1k|                         .as_int;
 1204|  51.1k|      mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
 1205|  51.1k|                                          features->allow_high_precision_mv,
 1206|  51.1k|                                          bsize, xd->mi_col, xd->mi_row,
 1207|  51.1k|                                          features->cur_frame_force_integer_mv)
 1208|  51.1k|                         .as_int;
 1209|  51.1k|      break;
 1210|  51.1k|    }
 1211|      0|    default: {
  ------------------
  |  Branch (1211:5): [True: 0, False: 4.55M]
  ------------------
 1212|      0|      return 0;
 1213|  51.1k|    }
 1214|  4.55M|  }
 1215|       |
 1216|  4.55M|  int ret = is_mv_valid(&mv[0].as_mv);
 1217|  4.55M|  if (is_compound) {
  ------------------
  |  Branch (1217:7): [True: 633k, False: 3.92M]
  ------------------
 1218|   633k|    ret = ret && is_mv_valid(&mv[1].as_mv);
  ------------------
  |  Branch (1218:11): [True: 616k, False: 17.4k]
  |  Branch (1218:18): [True: 610k, False: 5.71k]
  ------------------
 1219|   633k|  }
 1220|  4.55M|  return ret;
 1221|  4.55M|}
decodemv.c:read_interintra_mode:
  170|   339k|                                            int size_group) {
  171|   339k|  const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_symbol(
  ------------------
  |  |   51|   339k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  172|   339k|      r, xd->tile_ctx->interintra_mode_cdf[size_group], INTERINTRA_MODES,
  173|   339k|      ACCT_STR);
  174|   339k|  return ii_mode;
  175|   339k|}
decodemv.c:read_motion_mode:
  228|  4.21M|                                    MB_MODE_INFO *mbmi, aom_reader *r) {
  229|  4.21M|  if (cm->features.switchable_motion_mode == 0) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (229:7): [True: 504k, False: 3.71M]
  ------------------
  230|  3.71M|  if (mbmi->skip_mode) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (230:7): [True: 34.9k, False: 3.67M]
  ------------------
  231|       |
  232|  3.67M|  const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
  233|  3.67M|      xd->global_motion, xd, mbmi, cm->features.allow_warped_motion);
  234|  3.67M|  int motion_mode;
  235|       |
  236|  3.67M|  if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION;
  ------------------
  |  Branch (236:7): [True: 1.91M, False: 1.76M]
  ------------------
  237|       |
  238|  1.76M|  if (last_motion_mode_allowed == OBMC_CAUSAL) {
  ------------------
  |  Branch (238:7): [True: 401k, False: 1.36M]
  ------------------
  239|   401k|    motion_mode =
  240|   401k|        aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->bsize], 2, ACCT_STR);
  ------------------
  |  |   51|   401k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  241|   401k|    return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
  242|  1.36M|  } else {
  243|  1.36M|    motion_mode = aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->bsize],
  ------------------
  |  |   51|  1.36M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  244|  1.36M|                                  MOTION_MODES, ACCT_STR);
  245|  1.36M|    return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
  246|  1.36M|  }
  247|  1.76M|}
decodemv.c:read_mb_interp_filter:
 1038|  4.55M|                                         aom_reader *r) {
 1039|  4.55M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1040|       |
 1041|  4.55M|  if (!av1_is_interp_needed(xd)) {
  ------------------
  |  Branch (1041:7): [True: 897k, False: 3.65M]
  ------------------
 1042|   897k|    set_default_interp_filters(mbmi, interp_filter);
 1043|   897k|    return;
 1044|   897k|  }
 1045|       |
 1046|  3.65M|  if (interp_filter != SWITCHABLE) {
  ------------------
  |  Branch (1046:7): [True: 658k, False: 2.99M]
  ------------------
 1047|   658k|    mbmi->interp_filters = av1_broadcast_interp_filter(interp_filter);
 1048|  2.99M|  } else {
 1049|  2.99M|    InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
 1050|  3.20M|    for (int dir = 0; dir < 2; ++dir) {
  ------------------
  |  Branch (1050:23): [True: 3.10M, False: 105k]
  ------------------
 1051|  3.10M|      const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
 1052|  3.10M|      ref0_filter[dir] = (InterpFilter)aom_read_symbol(
  ------------------
  |  |   51|  3.10M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
 1053|  3.10M|          r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR);
 1054|  3.10M|      if (!enable_dual_filter) {
  ------------------
  |  Branch (1054:11): [True: 2.89M, False: 210k]
  ------------------
 1055|  2.89M|        ref0_filter[1] = ref0_filter[0];
 1056|  2.89M|        break;
 1057|  2.89M|      }
 1058|  3.10M|    }
 1059|       |    // The index system works as: (0, 1) -> (vertical, horizontal) filter types
 1060|  2.99M|    mbmi->interp_filters.as_filters.x_filter = ref0_filter[1];
 1061|  2.99M|    mbmi->interp_filters.as_filters.y_filter = ref0_filter[0];
 1062|  2.99M|  }
 1063|  3.65M|}
decodemv.c:read_intra_block_mode_info:
 1068|  1.80M|                                       aom_reader *r) {
 1069|  1.80M|  const BLOCK_SIZE bsize = mbmi->bsize;
 1070|  1.80M|  const int use_angle_delta = av1_use_angle_delta(bsize);
 1071|       |
 1072|  1.80M|  mbmi->ref_frame[0] = INTRA_FRAME;
 1073|  1.80M|  mbmi->ref_frame[1] = NONE_FRAME;
 1074|       |
 1075|  1.80M|  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 1076|       |
 1077|  1.80M|  mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]);
 1078|       |
 1079|  1.80M|  mbmi->angle_delta[PLANE_TYPE_Y] =
 1080|  1.80M|      use_angle_delta && av1_is_directional_mode(mbmi->mode)
  ------------------
  |  Branch (1080:7): [True: 1.47M, False: 329k]
  |  Branch (1080:26): [True: 306k, False: 1.16M]
  ------------------
 1081|  1.80M|          ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
 1082|  1.80M|          : 0;
 1083|  1.80M|  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
  ------------------
  |  Branch (1083:7): [True: 1.79M, False: 10.5k]
  |  Branch (1083:38): [True: 1.46M, False: 333k]
  ------------------
 1084|  1.46M|    mbmi->uv_mode =
 1085|  1.46M|        read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
 1086|  1.46M|    if (mbmi->uv_mode == UV_CFL_PRED) {
  ------------------
  |  Branch (1086:9): [True: 313k, False: 1.14M]
  ------------------
 1087|   313k|      mbmi->cfl_alpha_idx =
 1088|   313k|          read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs);
 1089|   313k|    }
 1090|  1.46M|    const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode);
 1091|  1.46M|    mbmi->angle_delta[PLANE_TYPE_UV] =
 1092|  1.46M|        use_angle_delta && av1_is_directional_mode(intra_mode)
  ------------------
  |  Branch (1092:9): [True: 1.31M, False: 143k]
  |  Branch (1092:28): [True: 301k, False: 1.01M]
  ------------------
 1093|  1.46M|            ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED])
 1094|  1.46M|            : 0;
 1095|  1.46M|  } else {
 1096|       |    // Avoid decoding angle_info if there is no chroma prediction
 1097|   343k|    mbmi->uv_mode = UV_DC_PRED;
 1098|   343k|  }
 1099|  1.80M|  xd->cfl.store_y = store_cfl_required(cm, xd);
 1100|       |
 1101|  1.80M|  mbmi->palette_mode_info.palette_size[0] = 0;
 1102|  1.80M|  mbmi->palette_mode_info.palette_size[1] = 0;
 1103|  1.80M|  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize))
  ------------------
  |  Branch (1103:7): [True: 42.7k, False: 1.76M]
  ------------------
 1104|  42.7k|    read_palette_mode_info(cm, xd, r);
 1105|       |
 1106|  1.80M|  read_filter_intra_mode_info(cm, xd, r);
 1107|  1.80M|}

av1_decoder_create:
   90|  16.1k|AV1Decoder *av1_decoder_create(BufferPool *const pool) {
   91|  16.1k|  AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi));
   92|  16.1k|  if (!pbi) return NULL;
  ------------------
  |  Branch (92:7): [True: 0, False: 16.1k]
  ------------------
   93|  16.1k|  av1_zero(*pbi);
  ------------------
  |  |   43|  16.1k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
   94|       |
   95|  16.1k|  AV1_COMMON *volatile const cm = &pbi->common;
   96|  16.1k|  cm->seq_params = &pbi->seq_params;
   97|  16.1k|  cm->error = &pbi->error;
   98|       |
   99|       |  // The jmp_buf is valid only for the duration of the function that calls
  100|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  101|       |  // before it returns.
  102|  16.1k|  if (setjmp(pbi->error.jmp)) {
  103|      0|    pbi->error.setjmp = 0;
  104|      0|    av1_decoder_remove(pbi);
  105|      0|    return NULL;
  106|      0|  }
  107|       |
  108|  16.1k|  pbi->error.setjmp = 1;
  109|       |
  110|  16.1k|  CHECK_MEM_ERROR(cm, cm->fc,
  ------------------
  |  |   51|  16.1k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  16.1k|  do {                                                    \
  |  |  |  |   69|  16.1k|    lval = (expr);                                        \
  |  |  |  |   70|  16.1k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 16.1k]
  |  |  |  |  ------------------
  |  |  |  |   71|  16.1k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  16.1k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  111|  16.1k|                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
  112|  16.1k|  CHECK_MEM_ERROR(
  ------------------
  |  |   51|  16.1k|  AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
  |  |  ------------------
  |  |  |  |   68|  16.1k|  do {                                                    \
  |  |  |  |   69|  16.1k|    lval = (expr);                                        \
  |  |  |  |   70|  16.1k|    if (!lval)                                            \
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (70:9): [True: 0, False: 16.1k]
  |  |  |  |  ------------------
  |  |  |  |   71|  16.1k|      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
  |  |  |  |   72|      0|                         "Failed to allocate " #lval);    \
  |  |  |  |   73|  16.1k|  } while (0)
  |  |  |  |  ------------------
  |  |  |  |  |  Branch (73:12): [Folded - Ignored]
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
  113|  16.1k|      cm, cm->default_frame_context,
  114|  16.1k|      (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
  115|  16.1k|  memset(cm->fc, 0, sizeof(*cm->fc));
  116|  16.1k|  memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
  117|       |
  118|  16.1k|  pbi->need_resync = 1;
  119|  16.1k|  initialize_dec();
  120|       |
  121|       |  // Initialize the references to not point to any frame buffers.
  122|   144k|  for (int i = 0; i < REF_FRAMES; i++) {
  ------------------
  |  Branch (122:19): [True: 128k, False: 16.1k]
  ------------------
  123|   128k|    cm->ref_frame_map[i] = NULL;
  124|   128k|  }
  125|       |
  126|  16.1k|  cm->current_frame.frame_number = 0;
  127|  16.1k|  pbi->decoding_first_frame = 1;
  128|  16.1k|  pbi->common.buffer_pool = pool;
  129|       |
  130|  16.1k|  cm->seq_params->bit_depth = AOM_BITS_8;
  131|       |
  132|  16.1k|  cm->mi_params.free_mi = dec_free_mi;
  133|  16.1k|  cm->mi_params.setup_mi = dec_setup_mi;
  134|  16.1k|  cm->mi_params.set_mb_mi = dec_set_mb_mi;
  135|       |
  136|  16.1k|  av1_loop_filter_init(cm);
  137|       |
  138|  16.1k|  av1_qm_init(&cm->quant_params, av1_num_planes(cm));
  139|  16.1k|  av1_loop_restoration_precal();
  140|       |
  141|       |#if CONFIG_ACCOUNTING
  142|       |  pbi->acct_enabled = 1;
  143|       |  aom_accounting_init(&pbi->accounting);
  144|       |#endif
  145|       |
  146|  16.1k|  pbi->error.setjmp = 0;
  147|       |
  148|  16.1k|  aom_get_worker_interface()->init(&pbi->lf_worker);
  149|  16.1k|  pbi->lf_worker.thread_name = "aom lf worker";
  150|       |
  151|  16.1k|  return pbi;
  152|  16.1k|}
av1_dealloc_dec_jobs:
  154|  7.64k|void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) {
  155|  7.64k|  if (tile_mt_info != NULL) {
  ------------------
  |  Branch (155:7): [True: 7.64k, False: 0]
  ------------------
  156|  7.64k|#if CONFIG_MULTITHREAD
  157|  7.64k|    if (tile_mt_info->job_mutex != NULL) {
  ------------------
  |  Branch (157:9): [True: 4.12k, False: 3.52k]
  ------------------
  158|  4.12k|      pthread_mutex_destroy(tile_mt_info->job_mutex);
  159|  4.12k|      aom_free(tile_mt_info->job_mutex);
  160|  4.12k|    }
  161|  7.64k|#endif
  162|  7.64k|    aom_free(tile_mt_info->job_queue);
  163|       |    // clear the structure as the source of this call may be a resize in which
  164|       |    // case this call will be followed by an _alloc() which may fail.
  165|  7.64k|    av1_zero(*tile_mt_info);
  ------------------
  |  |   43|  7.64k|#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
  ------------------
  166|  7.64k|  }
  167|  7.64k|}
av1_dec_free_cb_buf:
  169|  20.4k|void av1_dec_free_cb_buf(AV1Decoder *pbi) {
  170|  20.4k|  aom_free(pbi->cb_buffer_base);
  171|  20.4k|  pbi->cb_buffer_base = NULL;
  172|  20.4k|  pbi->cb_buffer_alloc_size = 0;
  173|  20.4k|}
av1_decoder_remove:
  175|  16.1k|void av1_decoder_remove(AV1Decoder *pbi) {
  176|  16.1k|  int i;
  177|       |
  178|  16.1k|  if (!pbi) return;
  ------------------
  |  Branch (178:7): [True: 0, False: 16.1k]
  ------------------
  179|       |
  180|       |  // Free the tile list output buffer.
  181|  16.1k|  aom_free_frame_buffer(&pbi->tile_list_outbuf);
  182|       |
  183|  16.1k|  aom_get_worker_interface()->end(&pbi->lf_worker);
  184|  16.1k|  aom_free(pbi->lf_worker.data1);
  185|       |
  186|  16.1k|  if (pbi->thread_data) {
  ------------------
  |  Branch (186:7): [True: 3.52k, False: 12.5k]
  ------------------
  187|   129k|    for (int worker_idx = 1; worker_idx < pbi->num_workers; worker_idx++) {
  ------------------
  |  Branch (187:30): [True: 125k, False: 3.52k]
  ------------------
  188|   125k|      DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
  189|   125k|      if (thread_data->td != NULL) {
  ------------------
  |  Branch (189:11): [True: 125k, False: 0]
  ------------------
  190|   125k|        av1_free_mc_tmp_buf(thread_data->td);
  191|   125k|        aom_free(thread_data->td);
  192|   125k|      }
  193|   125k|    }
  194|  3.52k|    aom_free(pbi->thread_data);
  195|  3.52k|  }
  196|  16.1k|  aom_free(pbi->dcb.xd.seg_mask);
  197|       |
  198|   145k|  for (i = 0; i < pbi->num_workers; ++i) {
  ------------------
  |  Branch (198:15): [True: 129k, False: 16.1k]
  ------------------
  199|   129k|    AVxWorker *const worker = &pbi->tile_workers[i];
  200|   129k|    aom_get_worker_interface()->end(worker);
  201|   129k|  }
  202|  16.1k|#if CONFIG_MULTITHREAD
  203|  16.1k|  if (pbi->row_mt_mutex_ != NULL) {
  ------------------
  |  Branch (203:7): [True: 3.32k, False: 12.7k]
  ------------------
  204|  3.32k|    pthread_mutex_destroy(pbi->row_mt_mutex_);
  205|  3.32k|    aom_free(pbi->row_mt_mutex_);
  206|  3.32k|  }
  207|  16.1k|  if (pbi->row_mt_cond_ != NULL) {
  ------------------
  |  Branch (207:7): [True: 3.32k, False: 12.7k]
  ------------------
  208|  3.32k|    pthread_cond_destroy(pbi->row_mt_cond_);
  209|  3.32k|    aom_free(pbi->row_mt_cond_);
  210|  3.32k|  }
  211|  16.1k|#endif
  212|  47.7k|  for (i = 0; i < pbi->allocated_tiles; i++) {
  ------------------
  |  Branch (212:15): [True: 31.6k, False: 16.1k]
  ------------------
  213|  31.6k|    TileDataDec *const tile_data = pbi->tile_data + i;
  214|  31.6k|    av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
  215|  31.6k|  }
  216|  16.1k|  aom_free(pbi->tile_data);
  217|  16.1k|  aom_free(pbi->tile_workers);
  218|       |
  219|  16.1k|  if (pbi->num_workers > 0) {
  ------------------
  |  Branch (219:7): [True: 3.52k, False: 12.5k]
  ------------------
  220|  3.52k|    av1_loop_filter_dealloc(&pbi->lf_row_sync);
  221|  3.52k|    av1_loop_restoration_dealloc(&pbi->lr_row_sync);
  222|  3.52k|    av1_dealloc_dec_jobs(&pbi->tile_mt_info);
  223|  3.52k|  }
  224|       |
  225|  16.1k|  av1_dec_free_cb_buf(pbi);
  226|       |#if CONFIG_ACCOUNTING
  227|       |  aom_accounting_clear(&pbi->accounting);
  228|       |#endif
  229|  16.1k|  av1_free_mc_tmp_buf(&pbi->td);
  230|  16.1k|  aom_img_metadata_array_free(pbi->metadata);
  231|  16.1k|  av1_remove_common(&pbi->common);
  232|  16.1k|  aom_free(pbi);
  233|  16.1k|}
av1_visit_palette:
  236|  38.3M|                       aom_reader *r, palette_visitor_fn_t visit) {
  237|  38.3M|  if (!is_inter_block(xd->mi[0])) {
  ------------------
  |  Branch (237:7): [True: 25.8M, False: 12.4M]
  ------------------
  238|  76.2M|    for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common));
  ------------------
  |  |   34|  76.2M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 73.6M, False: 2.58M]
  |  |  ------------------
  ------------------
  |  Branch (238:25): [True: 50.3M, False: 25.8M]
  ------------------
  239|  50.3M|         ++plane) {
  240|  50.3M|      if (plane == 0 || xd->is_chroma_ref) {
  ------------------
  |  Branch (240:11): [True: 25.8M, False: 24.5M]
  |  Branch (240:25): [True: 23.1M, False: 1.40M]
  ------------------
  241|  48.9M|        if (xd->mi[0]->palette_mode_info.palette_size[plane])
  ------------------
  |  Branch (241:13): [True: 324k, False: 48.6M]
  ------------------
  242|   324k|          visit(xd, plane, r);
  243|  48.9M|      } else {
  244|  1.40M|        assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0);
  245|  1.40M|      }
  246|  50.3M|    }
  247|  25.8M|  }
  248|  38.3M|}
av1_set_reference_dec:
  287|    357|                                      YV12_BUFFER_CONFIG *sd) {
  288|    357|  const int num_planes = av1_num_planes(cm);
  289|    357|  YV12_BUFFER_CONFIG *ref_buf = NULL;
  290|       |
  291|       |  // Get the destination reference buffer.
  292|    357|  ref_buf = get_ref_frame(cm, idx);
  293|       |
  294|    357|  if (ref_buf == NULL) {
  ------------------
  |  Branch (294:7): [True: 316, False: 41]
  ------------------
  295|    316|    aom_internal_error(cm->error, AOM_CODEC_ERROR, "No reference frame");
  296|    316|    return AOM_CODEC_ERROR;
  297|    316|  }
  298|       |
  299|     41|  if (!use_external_ref) {
  ------------------
  |  Branch (299:7): [True: 0, False: 41]
  ------------------
  300|      0|    if (!equal_dimensions(ref_buf, sd)) {
  ------------------
  |  Branch (300:9): [True: 0, False: 0]
  ------------------
  301|      0|      aom_internal_error(cm->error, AOM_CODEC_ERROR,
  302|      0|                         "Incorrect buffer dimensions");
  303|      0|    } else {
  304|       |      // Overwrite the reference frame buffer.
  305|      0|      aom_yv12_copy_frame(sd, ref_buf, num_planes);
  ------------------
  |  |   37|      0|#define aom_yv12_copy_frame aom_yv12_copy_frame_c
  ------------------
  306|      0|    }
  307|     41|  } else {
  308|     41|    if (!equal_dimensions_and_border(ref_buf, sd)) {
  ------------------
  |  Branch (308:9): [True: 41, False: 0]
  ------------------
  309|     41|      aom_internal_error(cm->error, AOM_CODEC_ERROR,
  310|     41|                         "Incorrect buffer dimensions");
  311|     41|    } else {
  312|       |      // Overwrite the reference frame buffer pointers.
  313|       |      // Once we no longer need the external reference buffer, these pointers
  314|       |      // are restored.
  315|      0|      ref_buf->store_buf_adr[0] = ref_buf->y_buffer;
  316|      0|      ref_buf->store_buf_adr[1] = ref_buf->u_buffer;
  317|      0|      ref_buf->store_buf_adr[2] = ref_buf->v_buffer;
  318|      0|      ref_buf->y_buffer = sd->y_buffer;
  319|      0|      ref_buf->u_buffer = sd->u_buffer;
  320|      0|      ref_buf->v_buffer = sd->v_buffer;
  321|      0|      ref_buf->use_external_reference_buffers = 1;
  322|      0|    }
  323|     41|  }
  324|       |
  325|     41|  return cm->error->error_code;
  326|    357|}
av1_receive_compressed_data:
  426|   311k|                                const uint8_t **psource) {
  427|   311k|  AV1_COMMON *volatile const cm = &pbi->common;
  428|   311k|  const uint8_t *source = *psource;
  429|   311k|  pbi->error.error_code = AOM_CODEC_OK;
  430|   311k|  pbi->error.has_detail = 0;
  431|       |
  432|   311k|  if (size == 0) {
  ------------------
  |  Branch (432:7): [True: 362, False: 311k]
  ------------------
  433|       |    // This is used to signal that we are missing frames.
  434|       |    // We do not know if the missing frame(s) was supposed to update
  435|       |    // any of the reference buffers, but we act conservative and
  436|       |    // mark only the last buffer as corrupted.
  437|       |    //
  438|       |    // TODO(jkoleszar): Error concealment is undefined and non-normative
  439|       |    // at this point, but if it becomes so, [0] may not always be the correct
  440|       |    // thing to do here.
  441|    362|    RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME);
  442|    362|    if (ref_buf != NULL) ref_buf->buf.corrupted = 1;
  ------------------
  |  Branch (442:9): [True: 0, False: 362]
  ------------------
  443|    362|  }
  444|       |
  445|   311k|  if (assign_cur_frame_new_fb(cm) == NULL) {
  ------------------
  |  Branch (445:7): [True: 0, False: 311k]
  ------------------
  446|      0|    pbi->error.error_code = AOM_CODEC_MEM_ERROR;
  447|      0|    return 1;
  448|      0|  }
  449|       |
  450|       |  // The jmp_buf is valid only for the duration of the function that calls
  451|       |  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  452|       |  // before it returns.
  453|   311k|  if (setjmp(pbi->error.jmp)) {
  454|   145k|    const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  455|   145k|    int i;
  456|       |
  457|   145k|    pbi->error.setjmp = 0;
  458|       |
  459|       |    // Synchronize all threads immediately as a subsequent decode call may
  460|       |    // cause a resize invalidating some allocations.
  461|   145k|    winterface->sync(&pbi->lf_worker);
  462|  1.84M|    for (i = 0; i < pbi->num_workers; ++i) {
  ------------------
  |  Branch (462:17): [True: 1.69M, False: 145k]
  ------------------
  463|  1.69M|      winterface->sync(&pbi->tile_workers[i]);
  464|  1.69M|    }
  465|       |
  466|   145k|    release_current_frame(pbi);
  467|   145k|    return -1;
  468|   145k|  }
  469|       |
  470|   166k|  pbi->error.setjmp = 1;
  471|       |
  472|   166k|  int frame_decoded =
  473|   166k|      aom_decode_frame_from_obus(pbi, source, source + size, psource);
  474|       |
  475|   166k|  if (frame_decoded < 0) {
  ------------------
  |  Branch (475:7): [True: 60.6k, False: 105k]
  ------------------
  476|  60.6k|    assert(pbi->error.error_code != AOM_CODEC_OK);
  477|  60.6k|    release_current_frame(pbi);
  478|  60.6k|    pbi->error.setjmp = 0;
  479|  60.6k|    return 1;
  480|  60.6k|  }
  481|       |
  482|       |#if TXCOEFF_TIMER
  483|       |  cm->cum_txcoeff_timer += cm->txcoeff_timer;
  484|       |  fprintf(stderr,
  485|       |          "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n",
  486|       |          cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer);
  487|       |  cm->txcoeff_timer = 0;
  488|       |  cm->txb_count = 0;
  489|       |#endif
  490|       |
  491|       |  // Note: At this point, this function holds a reference to cm->cur_frame
  492|       |  // in the buffer pool. This reference is consumed by update_frame_buffers().
  493|   105k|  update_frame_buffers(pbi, frame_decoded);
  494|       |
  495|   105k|  if (frame_decoded) {
  ------------------
  |  Branch (495:7): [True: 105k, False: 697]
  ------------------
  496|   105k|    pbi->decoding_first_frame = 0;
  497|   105k|  }
  498|       |
  499|   105k|  if (pbi->error.error_code != AOM_CODEC_OK) {
  ------------------
  |  Branch (499:7): [True: 5, False: 105k]
  ------------------
  500|      5|    pbi->error.setjmp = 0;
  501|      5|    return 1;
  502|      5|  }
  503|       |
  504|   105k|  if (!cm->show_existing_frame) {
  ------------------
  |  Branch (504:7): [True: 105k, False: 183]
  ------------------
  505|   105k|    if (cm->seg.enabled) {
  ------------------
  |  Branch (505:9): [True: 15.2k, False: 90.5k]
  ------------------
  506|  15.2k|      if (cm->prev_frame &&
  ------------------
  |  Branch (506:11): [True: 8.09k, False: 7.12k]
  ------------------
  507|  15.2k|          (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) &&
  ------------------
  |  Branch (507:11): [True: 7.01k, False: 1.08k]
  ------------------
  508|  15.2k|          (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) {
  ------------------
  |  Branch (508:11): [True: 7.00k, False: 5]
  ------------------
  509|  7.00k|        cm->last_frame_seg_map = cm->prev_frame->seg_map;
  510|  8.20k|      } else {
  511|  8.20k|        cm->last_frame_seg_map = NULL;
  512|  8.20k|      }
  513|  15.2k|    }
  514|   105k|  }
  515|       |
  516|       |  // Update progress in frame parallel decode.
  517|   105k|  pbi->error.setjmp = 0;
  518|       |
  519|   105k|  return 0;
  520|   105k|}
av1_get_raw_frame:
  524|   135k|                      aom_film_grain_t **grain_params) {
  525|   135k|  if (index >= pbi->num_output_frames) return -1;
  ------------------
  |  Branch (525:7): [True: 73.3k, False: 62.4k]
  ------------------
  526|  62.4k|  *sd = &pbi->output_frames[index]->buf;
  527|  62.4k|  *grain_params = &pbi->output_frames[index]->film_grain_params;
  528|  62.4k|  return 0;
  529|   135k|}
decoder.c:initialize_dec:
   38|  16.1k|static void initialize_dec(void) {
   39|  16.1k|  av1_rtcd();
   40|  16.1k|  aom_dsp_rtcd();
   41|  16.1k|  aom_scale_rtcd();
   42|  16.1k|  av1_init_intra_predictors();
   43|  16.1k|  av1_init_wedge_masks();
   44|  16.1k|}
decoder.c:dec_free_mi:
   79|  51.7k|static void dec_free_mi(CommonModeInfoParams *mi_params) {
   80|  51.7k|  aom_free(mi_params->mi_alloc);
   81|  51.7k|  mi_params->mi_alloc = NULL;
   82|  51.7k|  mi_params->mi_alloc_size = 0;
   83|  51.7k|  aom_free(mi_params->mi_grid_base);
   84|  51.7k|  mi_params->mi_grid_base = NULL;
   85|  51.7k|  mi_params->mi_grid_size = 0;
   86|  51.7k|  aom_free(mi_params->tx_type_map);
   87|  51.7k|  mi_params->tx_type_map = NULL;
   88|  51.7k|}
decoder.c:dec_setup_mi:
   72|   271k|static void dec_setup_mi(CommonModeInfoParams *mi_params) {
   73|   271k|  const int mi_grid_size =
   74|   271k|      mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
   75|   271k|  memset(mi_params->mi_grid_base, 0,
   76|   271k|         mi_grid_size * sizeof(*mi_params->mi_grid_base));
   77|   271k|}
decoder.c:dec_set_mb_mi:
   47|  95.9k|                          int height, BLOCK_SIZE min_partition_size) {
   48|  95.9k|  (void)min_partition_size;
   49|       |  // Ensure that the decoded width and height are both multiples of
   50|       |  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
   51|       |  // subsampling is used).
   52|       |  // This simplifies the implementation of various experiments,
   53|       |  // eg. cdef, which operates on units of 8x8 luma pixels.
   54|  95.9k|  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
  ------------------
  |  |   69|  95.9k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
   55|  95.9k|  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
  ------------------
  |  |   69|  95.9k|  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
  ------------------
   56|       |
   57|  95.9k|  mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  95.9k|#define MI_SIZE_LOG2 2
  ------------------
   58|  95.9k|  mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2;
  ------------------
  |  |   39|  95.9k|#define MI_SIZE_LOG2 2
  ------------------
   59|  95.9k|  mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
   60|       |
   61|  95.9k|  mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2);
  ------------------
  |  |   41|  95.9k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   62|  95.9k|  mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2);
  ------------------
  |  |   41|  95.9k|#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
  ------------------
   63|  95.9k|  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
   64|       |
   65|  95.9k|  mi_params->mi_alloc_bsize = BLOCK_4X4;
   66|  95.9k|  mi_params->mi_alloc_stride = mi_params->mi_stride;
   67|       |
   68|  95.9k|  assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
   69|  95.9k|         mi_size_high[mi_params->mi_alloc_bsize]);
   70|  95.9k|}
decoder.c:equal_dimensions_and_border:
  276|     41|                                       const YV12_BUFFER_CONFIG *b) {
  277|     41|  return a->y_height == b->y_height && a->y_width == b->y_width &&
  ------------------
  |  Branch (277:10): [True: 0, False: 41]
  |  Branch (277:40): [True: 0, False: 0]
  ------------------
  278|     41|         a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
  ------------------
  |  Branch (278:10): [True: 0, False: 0]
  |  Branch (278:42): [True: 0, False: 0]
  ------------------
  279|     41|         a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
  ------------------
  |  Branch (279:10): [True: 0, False: 0]
  |  Branch (279:40): [True: 0, False: 0]
  ------------------
  280|     41|         a->border == b->border &&
  ------------------
  |  Branch (280:10): [True: 0, False: 0]
  ------------------
  281|     41|         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
  ------------------
  |  |  142|      0|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  |  Branch (281:10): [True: 0, False: 0]
  ------------------
  282|      0|             (b->flags & YV12_FLAG_HIGHBITDEPTH);
  ------------------
  |  |  142|      0|#define YV12_FLAG_HIGHBITDEPTH 8
  ------------------
  283|     41|}
decoder.c:release_current_frame:
  342|   205k|static void release_current_frame(AV1Decoder *pbi) {
  343|   205k|  AV1_COMMON *const cm = &pbi->common;
  344|   205k|  BufferPool *const pool = cm->buffer_pool;
  345|       |
  346|   205k|  cm->cur_frame->buf.corrupted = 1;
  347|   205k|  lock_buffer_pool(pool);
  348|   205k|  decrease_ref_count(cm->cur_frame, pool);
  349|   205k|  unlock_buffer_pool(pool);
  350|   205k|  cm->cur_frame = NULL;
  351|   205k|}
decoder.c:update_frame_buffers:
  358|   105k|static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
  359|   105k|  int ref_index = 0, mask;
  360|   105k|  AV1_COMMON *const cm = &pbi->common;
  361|   105k|  BufferPool *const pool = cm->buffer_pool;
  362|       |
  363|   105k|  if (frame_decoded) {
  ------------------
  |  Branch (363:7): [True: 105k, False: 697]
  ------------------
  364|   105k|    lock_buffer_pool(pool);
  365|       |
  366|       |    // In ext-tile decoding, the camera frame header is only decoded once. So,
  367|       |    // we don't update the references here.
  368|   105k|    if (!pbi->camera_frame_header_ready) {
  ------------------
  |  Branch (368:9): [True: 91.2k, False: 14.0k]
  ------------------
  369|       |      // The following for loop needs to release the reference stored in
  370|       |      // cm->ref_frame_map[ref_index] before storing a reference to
  371|       |      // cm->cur_frame in cm->ref_frame_map[ref_index].
  372|   637k|      for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) {
  ------------------
  |  Branch (372:58): [True: 546k, False: 91.2k]
  ------------------
  373|   546k|        if (mask & 1) {
  ------------------
  |  Branch (373:13): [True: 395k, False: 151k]
  ------------------
  374|   395k|          decrease_ref_count(cm->ref_frame_map[ref_index], pool);
  375|   395k|          cm->ref_frame_map[ref_index] = cm->cur_frame;
  376|   395k|          ++cm->cur_frame->ref_count;
  377|   395k|        }
  378|   546k|        ++ref_index;
  379|   546k|      }
  380|  91.2k|    }
  381|       |
  382|   105k|    if (cm->show_existing_frame || cm->show_frame) {
  ------------------
  |  Branch (382:9): [True: 143, False: 105k]
  |  Branch (382:36): [True: 81.2k, False: 23.8k]
  ------------------
  383|  81.3k|      if (pbi->output_all_layers) {
  ------------------
  |  Branch (383:11): [True: 18.0k, False: 63.3k]
  ------------------
  384|       |        // Append this frame to the output queue
  385|  18.0k|        if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) {
  ------------------
  |  |   71|  18.0k|#define MAX_NUM_SPATIAL_LAYERS 4
  ------------------
  |  Branch (385:13): [True: 5, False: 18.0k]
  ------------------
  386|       |          // We can't store the new frame anywhere, so drop it and return an
  387|       |          // error
  388|      5|          cm->cur_frame->buf.corrupted = 1;
  389|      5|          decrease_ref_count(cm->cur_frame, pool);
  390|      5|          pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  391|  18.0k|        } else {
  392|  18.0k|          pbi->output_frames[pbi->num_output_frames] = cm->cur_frame;
  393|  18.0k|          pbi->num_output_frames++;
  394|  18.0k|        }
  395|  63.3k|      } else {
  396|       |        // Replace any existing output frame
  397|  63.3k|        assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1);
  398|  63.3k|        if (pbi->num_output_frames > 0) {
  ------------------
  |  Branch (398:13): [True: 15.5k, False: 47.8k]
  ------------------
  399|  15.5k|          decrease_ref_count(pbi->output_frames[0], pool);
  400|  15.5k|        }
  401|  63.3k|        pbi->output_frames[0] = cm->cur_frame;
  402|  63.3k|        pbi->num_output_frames = 1;
  403|  63.3k|      }
  404|  81.3k|    } else {
  405|  23.8k|      decrease_ref_count(cm->cur_frame, pool);
  406|  23.8k|    }
  407|       |
  408|   105k|    unlock_buffer_pool(pool);
  409|   105k|  } else {
  410|       |    // Nothing was decoded, so just drop this frame buffer
  411|    697|    lock_buffer_pool(pool);
  412|    697|    decrease_ref_count(cm->cur_frame, pool);
  413|    697|    unlock_buffer_pool(pool);
  414|    697|  }
  415|   105k|  cm->cur_frame = NULL;
  416|       |
  417|   105k|  if (!pbi->camera_frame_header_ready) {
  ------------------
  |  Branch (417:7): [True: 91.9k, False: 14.0k]
  ------------------
  418|       |    // Invalidate these references until the next frame starts.
  419|   735k|    for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
  ------------------
  |  Branch (419:25): [True: 643k, False: 91.9k]
  ------------------
  420|   643k|      cm->remapped_ref_idx[ref_index] = INVALID_IDX;
  ------------------
  |  |   15|   643k|#define INVALID_IDX -1  // Invalid buffer index.
  ------------------
  421|   643k|    }
  422|  91.9k|  }
  423|   105k|}

av1_dx_iface.c:decrease_ref_count:
  405|  62.6k|                                      BufferPool *const pool) {
  406|  62.6k|  if (buf != NULL) {
  ------------------
  |  Branch (406:7): [True: 62.6k, False: 0]
  ------------------
  407|  62.6k|    --buf->ref_count;
  408|       |    // Reference counts should never become negative. If this assertion fails,
  409|       |    // there is a bug in our reference count management.
  410|  62.6k|    assert(buf->ref_count >= 0);
  411|       |    // A worker may only get a free framebuffer index when calling get_free_fb.
  412|       |    // But the raw frame buffer is not set up until we finish decoding header.
  413|       |    // So if any error happens during decoding header, frame_bufs[idx] will not
  414|       |    // have a valid raw frame buffer.
  415|  62.6k|    if (buf->ref_count == 0 && buf->raw_frame_buffer.data) {
  ------------------
  |  Branch (415:9): [True: 14.1k, False: 48.5k]
  |  Branch (415:32): [True: 14.1k, False: 0]
  ------------------
  416|  14.1k|      pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
  417|  14.1k|      buf->raw_frame_buffer.data = NULL;
  418|  14.1k|      buf->raw_frame_buffer.size = 0;
  419|  14.1k|      buf->raw_frame_buffer.priv = NULL;
  420|  14.1k|    }
  421|  62.6k|  }
  422|  62.6k|}
decodeframe.c:decrease_ref_count:
  405|  1.20M|                                      BufferPool *const pool) {
  406|  1.20M|  if (buf != NULL) {
  ------------------
  |  Branch (406:7): [True: 395k, False: 805k]
  ------------------
  407|   395k|    --buf->ref_count;
  408|       |    // Reference counts should never become negative. If this assertion fails,
  409|       |    // there is a bug in our reference count management.
  410|   395k|    assert(buf->ref_count >= 0);
  411|       |    // A worker may only get a free framebuffer index when calling get_free_fb.
  412|       |    // But the raw frame buffer is not set up until we finish decoding header.
  413|       |    // So if any error happens during decoding header, frame_bufs[idx] will not
  414|       |    // have a valid raw frame buffer.
  415|   395k|    if (buf->ref_count == 0 && buf->raw_frame_buffer.data) {
  ------------------
  |  Branch (415:9): [True: 248k, False: 146k]
  |  Branch (415:32): [True: 248k, False: 467]
  ------------------
  416|   248k|      pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
  417|   248k|      buf->raw_frame_buffer.data = NULL;
  418|   248k|      buf->raw_frame_buffer.size = 0;
  419|   248k|      buf->raw_frame_buffer.priv = NULL;
  420|   248k|    }
  421|   395k|  }
  422|  1.20M|}
decoder.c:decrease_ref_count:
  405|   640k|                                      BufferPool *const pool) {
  406|   640k|  if (buf != NULL) {
  ------------------
  |  Branch (406:7): [True: 459k, False: 181k]
  ------------------
  407|   459k|    --buf->ref_count;
  408|       |    // Reference counts should never become negative. If this assertion fails,
  409|       |    // there is a bug in our reference count management.
  410|   459k|    assert(buf->ref_count >= 0);
  411|       |    // A worker may only get a free framebuffer index when calling get_free_fb.
  412|       |    // But the raw frame buffer is not set up until we finish decoding header.
  413|       |    // So if any error happens during decoding header, frame_bufs[idx] will not
  414|       |    // have a valid raw frame buffer.
  415|   459k|    if (buf->ref_count == 0 && buf->raw_frame_buffer.data) {
  ------------------
  |  Branch (415:9): [True: 293k, False: 165k]
  |  Branch (415:32): [True: 180k, False: 112k]
  ------------------
  416|   180k|      pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
  417|   180k|      buf->raw_frame_buffer.data = NULL;
  418|   180k|      buf->raw_frame_buffer.size = 0;
  419|   180k|      buf->raw_frame_buffer.priv = NULL;
  420|   180k|    }
  421|   459k|  }
  422|   640k|}
detokenize.c:av1_read_uniform:
  425|   161k|static inline int av1_read_uniform(aom_reader *r, int n) {
  426|   161k|  const int l = get_unsigned_bits(n);
  427|   161k|  const int m = (1 << l) - n;
  428|   161k|  const int v = aom_read_literal(r, l - 1, ACCT_STR);
  ------------------
  |  |   47|   161k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  429|   161k|  assert(l != 0);
  430|   161k|  if (v < m)
  ------------------
  |  Branch (430:7): [True: 127k, False: 34.3k]
  ------------------
  431|   127k|    return v;
  432|  34.3k|  else
  433|  34.3k|    return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR);
  ------------------
  |  |   47|  34.3k|  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  434|   161k|}

av1_read_coeffs_txb:
  327|  36.2M|                         const int col, const TX_SIZE tx_size) {
  328|       |#if TXCOEFF_TIMER
  329|       |  struct aom_usec_timer timer;
  330|       |  aom_usec_timer_start(&timer);
  331|       |#endif
  332|  36.2M|  MACROBLOCKD *const xd = &dcb->xd;
  333|  36.2M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  334|  36.2M|  struct macroblockd_plane *const pd = &xd->plane[plane];
  335|       |
  336|  36.2M|  const BLOCK_SIZE bsize = mbmi->bsize;
  337|  36.2M|  assert(bsize < BLOCK_SIZES_ALL);
  338|  36.2M|  const BLOCK_SIZE plane_bsize =
  339|  36.2M|      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
  340|       |
  341|  36.2M|  TXB_CTX txb_ctx;
  342|  36.2M|  get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col,
  343|  36.2M|              pd->left_entropy_context + row, &txb_ctx);
  344|  36.2M|  const uint8_t cul_level =
  345|  36.2M|      read_coeffs_txb(cm, dcb, r, row, col, plane, &txb_ctx, tx_size);
  346|  36.2M|  av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col,
  347|  36.2M|                           row);
  348|       |
  349|  36.2M|  if (is_inter_block(mbmi)) {
  ------------------
  |  Branch (349:7): [True: 10.7M, False: 25.5M]
  ------------------
  350|  10.7M|    const PLANE_TYPE plane_type = get_plane_type(plane);
  351|       |    // tx_type will be read out in av1_read_coeffs_txb_facade
  352|  10.7M|    const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size,
  353|  10.7M|                                            cm->features.reduced_tx_set_used);
  354|       |
  355|  10.7M|    if (plane == 0) {
  ------------------
  |  Branch (355:9): [True: 4.54M, False: 6.22M]
  ------------------
  356|  4.54M|      const int txw = tx_size_wide_unit[tx_size];
  357|  4.54M|      const int txh = tx_size_high_unit[tx_size];
  358|       |      // The 16x16 unit is due to the constraint from tx_64x64 which sets the
  359|       |      // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block
  360|       |      // size, the constraint takes effect in 32x16 / 16x32 size too. To solve
  361|       |      // the intricacy, cover all the 16x16 units inside a 64 level transform.
  362|  4.54M|      if (txw == tx_size_wide_unit[TX_64X64] ||
  ------------------
  |  Branch (362:11): [True: 154k, False: 4.39M]
  ------------------
  363|  4.54M|          txh == tx_size_high_unit[TX_64X64]) {
  ------------------
  |  Branch (363:11): [True: 27.5k, False: 4.36M]
  ------------------
  364|   182k|        const int tx_unit = tx_size_wide_unit[TX_16X16];
  365|   182k|        const int stride = xd->tx_type_map_stride;
  366|   790k|        for (int idy = 0; idy < txh; idy += tx_unit) {
  ------------------
  |  Branch (366:27): [True: 607k, False: 182k]
  ------------------
  367|  2.75M|          for (int idx = 0; idx < txw; idx += tx_unit) {
  ------------------
  |  Branch (367:29): [True: 2.14M, False: 607k]
  ------------------
  368|  2.14M|            xd->tx_type_map[(row + idy) * stride + col + idx] = tx_type;
  369|  2.14M|          }
  370|   607k|        }
  371|   182k|      }
  372|  4.54M|    }
  373|  10.7M|  }
  374|       |
  375|       |#if TXCOEFF_TIMER
  376|       |  aom_usec_timer_mark(&timer);
  377|       |  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
  378|       |  cm->txcoeff_timer += elapsed_time;
  379|       |  ++cm->txb_count;
  380|       |#endif
  381|  36.2M|}
decodetxb.c:read_coeffs_txb:
  114|  36.4M|                               const TX_SIZE tx_size) {
  115|  36.4M|  MACROBLOCKD *const xd = &dcb->xd;
  116|  36.4M|  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
  117|  36.4M|  const int32_t max_value = (1 << (7 + xd->bd)) - 1;
  118|  36.4M|  const int32_t min_value = -(1 << (7 + xd->bd));
  119|  36.4M|  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
  120|  36.4M|  const PLANE_TYPE plane_type = get_plane_type(plane);
  121|  36.4M|  MB_MODE_INFO *const mbmi = xd->mi[0];
  122|  36.4M|  struct macroblockd_plane *const pd = &xd->plane[plane];
  123|  36.4M|  const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
  124|  36.4M|  tran_low_t *const tcoeffs = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
  125|  36.4M|  const int shift = av1_get_tx_scale(tx_size);
  126|  36.4M|  const int bhl = get_txb_bhl(tx_size);
  127|  36.4M|  const int width = get_txb_wide(tx_size);
  128|  36.4M|  const int height = get_txb_high(tx_size);
  129|  36.4M|  int cul_level = 0;
  130|  36.4M|  int dc_val = 0;
  131|  36.4M|  uint8_t levels_buf[TX_PAD_2D];
  132|  36.4M|  uint8_t *const levels = set_levels(levels_buf, height);
  133|  36.4M|  const int all_zero = aom_read_symbol(
  ------------------
  |  |   51|  36.4M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  134|  36.4M|      r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR);
  135|  36.4M|  eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
  136|  36.4M|  uint16_t *const eob = &(eob_data->eob);
  137|  36.4M|  uint16_t *const max_scan_line = &(eob_data->max_scan_line);
  138|  36.4M|  *max_scan_line = 0;
  139|  36.4M|  *eob = 0;
  140|       |
  141|       |#if CONFIG_INSPECTION
  142|       |  if (plane == 0) {
  143|       |    const int txk_type_idx =
  144|       |        av1_get_txk_type_index(mbmi->bsize, blk_row, blk_col);
  145|       |    mbmi->tx_skip[txk_type_idx] = all_zero;
  146|       |  }
  147|       |#endif
  148|       |
  149|  36.4M|  if (all_zero) {
  ------------------
  |  Branch (149:7): [True: 14.3M, False: 22.0M]
  ------------------
  150|  14.3M|    *max_scan_line = 0;
  151|  14.3M|    if (plane == 0) {
  ------------------
  |  Branch (151:9): [True: 3.37M, False: 11.0M]
  ------------------
  152|  3.37M|      xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] = DCT_DCT;
  153|  3.37M|    }
  154|  14.3M|    return 0;
  155|  14.3M|  }
  156|       |
  157|  22.0M|  if (plane == AOM_PLANE_Y) {
  ------------------
  |  |  226|  22.0M|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
  |  Branch (157:7): [True: 12.0M, False: 9.98M]
  ------------------
  158|       |    // only y plane's tx_type is transmitted
  159|  12.0M|    av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
  160|  12.0M|  }
  161|  22.0M|  const TX_TYPE tx_type =
  162|  22.0M|      av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
  163|  22.0M|                      cm->features.reduced_tx_set_used);
  164|  22.0M|  const TX_CLASS tx_class = tx_type_to_class[tx_type];
  165|  22.0M|  const qm_val_t *iqmatrix =
  166|  22.0M|      av1_get_iqmatrix(&cm->quant_params, xd, plane, tx_size, tx_type);
  167|  22.0M|  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
  168|  22.0M|  const int16_t *const scan = scan_order->scan;
  169|  22.0M|  int eob_extra = 0;
  170|  22.0M|  int eob_pt = 1;
  171|       |
  172|  22.0M|  const int eob_multi_size = txsize_log2_minus4[tx_size];
  173|  22.0M|  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
  ------------------
  |  Branch (173:29): [True: 20.3M, False: 1.67M]
  ------------------
  174|  22.0M|  switch (eob_multi_size) {
  175|  7.52M|    case 0:
  ------------------
  |  Branch (175:5): [True: 7.52M, False: 14.4M]
  ------------------
  176|  7.52M|      eob_pt =
  177|  7.52M|          aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|  7.52M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  178|  7.52M|                          5, ACCT_STR) +
  179|  7.52M|          1;
  180|  7.52M|      break;
  181|  2.55M|    case 1:
  ------------------
  |  Branch (181:5): [True: 2.55M, False: 19.4M]
  ------------------
  182|  2.55M|      eob_pt =
  183|  2.55M|          aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|  2.55M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  184|  2.55M|                          6, ACCT_STR) +
  185|  2.55M|          1;
  186|  2.55M|      break;
  187|  4.83M|    case 2:
  ------------------
  |  Branch (187:5): [True: 4.83M, False: 17.1M]
  ------------------
  188|  4.83M|      eob_pt =
  189|  4.83M|          aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|  4.83M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  190|  4.83M|                          7, ACCT_STR) +
  191|  4.83M|          1;
  192|  4.83M|      break;
  193|  1.75M|    case 3:
  ------------------
  |  Branch (193:5): [True: 1.75M, False: 20.2M]
  ------------------
  194|  1.75M|      eob_pt =
  195|  1.75M|          aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|  1.75M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  196|  1.75M|                          8, ACCT_STR) +
  197|  1.75M|          1;
  198|  1.75M|      break;
  199|  2.99M|    case 4:
  ------------------
  |  Branch (199:5): [True: 2.99M, False: 19.0M]
  ------------------
  200|  2.99M|      eob_pt =
  201|  2.99M|          aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|  2.99M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  202|  2.99M|                          9, ACCT_STR) +
  203|  2.99M|          1;
  204|  2.99M|      break;
  205|   850k|    case 5:
  ------------------
  |  Branch (205:5): [True: 850k, False: 21.1M]
  ------------------
  206|   850k|      eob_pt =
  207|   850k|          aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx],
  ------------------
  |  |   51|   850k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  208|   850k|                          10, ACCT_STR) +
  209|   850k|          1;
  210|   850k|      break;
  211|  1.08M|    case 6:
  ------------------
  |  Branch (211:5): [True: 1.08M, False: 20.9M]
  ------------------
  212|  1.08M|    default:
  ------------------
  |  Branch (212:5): [True: 0, False: 22.0M]
  ------------------
  213|  1.08M|      eob_pt = aom_read_symbol(
  ------------------
  |  |   51|  1.08M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  214|  1.08M|                   r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11,
  215|  1.08M|                   ACCT_STR) +
  216|  1.08M|               1;
  217|  1.08M|      break;
  218|  22.0M|  }
  219|       |
  220|  21.6M|  const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
  221|  21.6M|  if (eob_offset_bits > 0) {
  ------------------
  |  Branch (221:7): [True: 11.2M, False: 10.3M]
  ------------------
  222|  11.2M|    const int eob_ctx = eob_pt - 3;
  223|  11.2M|    int bit = aom_read_symbol(
  ------------------
  |  |   51|  11.2M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  224|  11.2M|        r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR);
  225|  11.2M|    if (bit) {
  ------------------
  |  Branch (225:9): [True: 4.44M, False: 6.83M]
  ------------------
  226|  4.44M|      eob_extra += (1 << (eob_offset_bits - 1));
  227|  4.44M|    }
  228|       |
  229|  34.6M|    for (int i = 1; i < eob_offset_bits; i++) {
  ------------------
  |  Branch (229:21): [True: 23.4M, False: 11.2M]
  ------------------
  230|  23.4M|      bit = aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  23.4M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  231|  23.4M|      if (bit) {
  ------------------
  |  Branch (231:11): [True: 11.4M, False: 11.9M]
  ------------------
  232|  11.4M|        eob_extra += (1 << (eob_offset_bits - 1 - i));
  233|  11.4M|      }
  234|  23.4M|    }
  235|  11.2M|  }
  236|  21.6M|  *eob = rec_eob_pos(eob_pt, eob_extra);
  237|       |
  238|  21.6M|  if (*eob > 1) {
  ------------------
  |  Branch (238:7): [True: 12.0M, False: 9.59M]
  ------------------
  239|  12.0M|    memset(levels_buf, 0,
  240|  12.0M|           sizeof(*levels_buf) *
  241|  12.0M|               ((height + TX_PAD_HOR) * (width + TX_PAD_VER) + TX_PAD_END));
  ------------------
  |  |  190|  12.0M|#define TX_PAD_HOR 4
  ------------------
                             ((height + TX_PAD_HOR) * (width + TX_PAD_VER) + TX_PAD_END));
  ------------------
  |  |  195|  12.0M|#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
  |  |  ------------------
  |  |  |  |  193|  12.0M|#define TX_PAD_TOP 0
  |  |  ------------------
  |  |               #define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
  |  |  ------------------
  |  |  |  |  194|  12.0M|#define TX_PAD_BOTTOM 4
  |  |  ------------------
  ------------------
                             ((height + TX_PAD_HOR) * (width + TX_PAD_VER) + TX_PAD_END));
  ------------------
  |  |  197|  12.0M|#define TX_PAD_END 16
  ------------------
  242|  12.0M|  }
  243|       |
  244|  21.6M|  {
  245|       |    // Read the non-zero coefficient with scan index eob-1
  246|       |    // TODO(angiebird): Put this into a function
  247|  21.6M|    const int c = *eob - 1;
  248|  21.6M|    const int pos = scan[c];
  249|  21.6M|    const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, c);
  250|  21.6M|    const int nsymbs = 3;
  251|  21.6M|    aom_cdf_prob *cdf =
  252|  21.6M|        ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx];
  253|  21.6M|    int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1;
  ------------------
  |  |   51|  21.6M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  254|  21.6M|    if (level > NUM_BASE_LEVELS) {
  ------------------
  |  |   46|  21.6M|#define NUM_BASE_LEVELS 2
  ------------------
  |  Branch (254:9): [True: 660k, False: 20.9M]
  ------------------
  255|   660k|      const int br_ctx = get_br_ctx_eob(pos, bhl, tx_class);
  256|   660k|      cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
  ------------------
  |  |   34|   660k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 589k, False: 71.2k]
  |  |  ------------------
  ------------------
  257|   912k|      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   49|   912k|#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
  |  |  ------------------
  |  |  |  |   48|   912k|#define BR_CDF_SIZE (4)
  |  |  ------------------
  ------------------
                    for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   48|   252k|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (257:25): [True: 896k, False: 16.2k]
  ------------------
  258|   896k|        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
  ------------------
  |  |   51|   896k|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  259|   896k|        level += k;
  260|   896k|        if (k < BR_CDF_SIZE - 1) break;
  ------------------
  |  |   48|   896k|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (260:13): [True: 644k, False: 252k]
  ------------------
  261|   896k|      }
  262|   660k|    }
  263|  21.6M|    levels[get_padded_idx(pos, bhl)] = level;
  264|  21.6M|  }
  265|  21.6M|  if (*eob > 1) {
  ------------------
  |  Branch (265:7): [True: 12.0M, False: 9.64M]
  ------------------
  266|  12.0M|    base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type];
  267|  12.0M|    br_cdf_arr br_cdf =
  268|  12.0M|        ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type];
  ------------------
  |  |   34|  12.0M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 10.7M, False: 1.27M]
  |  |  ------------------
  ------------------
  269|  12.0M|    if (tx_class == TX_CLASS_2D) {
  ------------------
  |  Branch (269:9): [True: 10.9M, False: 1.03M]
  ------------------
  270|  10.9M|      read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bhl, levels,
  271|  10.9M|                             base_cdf, br_cdf);
  272|  10.9M|      read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bhl, levels,
  273|  10.9M|                          base_cdf, br_cdf);
  274|  10.9M|    } else {
  275|  1.03M|      read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bhl,
  276|  1.03M|                          levels, base_cdf, br_cdf);
  277|  1.03M|    }
  278|  12.0M|  }
  279|       |
  280|   297M|  for (int c = 0; c < *eob; ++c) {
  ------------------
  |  Branch (280:19): [True: 275M, False: 21.6M]
  ------------------
  281|   275M|    const int pos = scan[c];
  282|   275M|    uint8_t sign;
  283|   275M|    tran_low_t level = levels[get_padded_idx(pos, bhl)];
  284|   275M|    if (level) {
  ------------------
  |  Branch (284:9): [True: 102M, False: 173M]
  ------------------
  285|   102M|      *max_scan_line = AOMMAX(*max_scan_line, pos);
  ------------------
  |  |   35|   102M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 39.6M, False: 62.3M]
  |  |  ------------------
  ------------------
  286|   102M|      if (c == 0) {
  ------------------
  |  Branch (286:11): [True: 17.8M, False: 84.1M]
  ------------------
  287|  17.8M|        const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
  288|  17.8M|        sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
  ------------------
  |  |   51|  17.8M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  289|  17.8M|                               2, ACCT_STR);
  290|  84.1M|      } else {
  291|  84.1M|        sign = aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  84.1M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  292|  84.1M|      }
  293|   102M|      if (level >= MAX_BASE_BR_RANGE) {
  ------------------
  |  |   53|   102M|#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
  |  |  ------------------
  |  |  |  |   49|   102M|#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
  |  |  |  |  ------------------
  |  |  |  |  |  |   48|   102M|#define BR_CDF_SIZE (4)
  |  |  |  |  ------------------
  |  |  ------------------
  |  |               #define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
  |  |  ------------------
  |  |  |  |   46|   102M|#define NUM_BASE_LEVELS 2
  |  |  ------------------
  ------------------
  |  Branch (293:11): [True: 901k, False: 101M]
  ------------------
  294|   901k|        level += read_golomb(xd, r);
  295|   901k|      }
  296|       |
  297|   102M|      if (c == 0) dc_val = sign ? -level : level;
  ------------------
  |  Branch (297:11): [True: 17.9M, False: 84.0M]
  |  Branch (297:28): [True: 5.65M, False: 12.3M]
  ------------------
  298|       |
  299|       |      // Bitmasking to clamp level to valid range:
  300|       |      //   The valid range for 8/10/12 bit vdieo is at most 14/16/18 bit
  301|   102M|      level &= 0xfffff;
  302|   102M|      cul_level += level;
  303|   102M|      tran_low_t dq_coeff;
  304|       |      // Bitmasking to clamp dq_coeff to valid range:
  305|       |      //   The valid range for 8/10/12 bit video is at most 17/19/21 bit
  306|   102M|      dq_coeff =
  307|   102M|          (tran_low_t)((int64_t)level * get_dqv(dequant, scan[c], iqmatrix) &
  308|   102M|                       0xffffff);
  309|   102M|      dq_coeff = dq_coeff >> shift;
  310|   102M|      if (sign) {
  ------------------
  |  Branch (310:11): [True: 49.0M, False: 52.9M]
  ------------------
  311|  49.0M|        dq_coeff = -dq_coeff;
  312|  49.0M|      }
  313|   102M|      tcoeffs[pos] = clamp(dq_coeff, min_value, max_value);
  314|   102M|    }
  315|   275M|  }
  316|       |
  317|  21.6M|  cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
  ------------------
  |  |   34|  21.6M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 5.74M, False: 15.9M]
  |  |  ------------------
  ------------------
  318|       |
  319|       |  // DC value
  320|  21.6M|  set_dc_sign(&cul_level, dc_val);
  321|       |
  322|  21.6M|  return cul_level;
  323|  22.0M|}
decodetxb.c:rec_eob_pos:
   45|  21.7M|static inline int rec_eob_pos(const int eob_token, const int extra) {
   46|  21.7M|  int eob = av1_eob_group_start[eob_token];
   47|  21.7M|  if (eob > 2) {
  ------------------
  |  Branch (47:7): [True: 11.2M, False: 10.4M]
  ------------------
   48|  11.2M|    eob += extra;
   49|  11.2M|  }
   50|  21.7M|  return eob;
   51|  21.7M|}
decodetxb.c:read_coeffs_reverse_2d:
   67|  10.9M|                                          br_cdf_arr br_cdf) {
   68|   235M|  for (int c = end_si; c >= start_si; --c) {
  ------------------
  |  Branch (68:24): [True: 224M, False: 10.9M]
  ------------------
   69|   224M|    const int pos = scan[c];
   70|   224M|    const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bhl, tx_size);
   71|   224M|    const int nsymbs = 4;
   72|   224M|    int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR);
  ------------------
  |  |   51|   224M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   73|   224M|    if (level > NUM_BASE_LEVELS) {
  ------------------
  |  |   46|   224M|#define NUM_BASE_LEVELS 2
  ------------------
  |  Branch (73:9): [True: 15.0M, False: 209M]
  ------------------
   74|  15.0M|      const int br_ctx = get_br_ctx_2d(levels, pos, bhl);
   75|  15.0M|      aom_cdf_prob *cdf = br_cdf[br_ctx];
   76|  21.3M|      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   49|  21.3M|#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
  |  |  ------------------
  |  |  |  |   48|  21.3M|#define BR_CDF_SIZE (4)
  |  |  ------------------
  ------------------
                    for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   48|  6.32M|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (76:25): [True: 20.8M, False: 511k]
  ------------------
   77|  20.8M|        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
  ------------------
  |  |   51|  20.8M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   78|  20.8M|        level += k;
   79|  20.8M|        if (k < BR_CDF_SIZE - 1) break;
  ------------------
  |  |   48|  20.8M|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (79:13): [True: 14.5M, False: 6.32M]
  ------------------
   80|  20.8M|      }
   81|  15.0M|    }
   82|   224M|    levels[get_padded_idx(pos, bhl)] = level;
   83|   224M|  }
   84|  10.9M|}
decodetxb.c:read_coeffs_reverse:
   90|  11.9M|                                       br_cdf_arr br_cdf) {
   91|  42.4M|  for (int c = end_si; c >= start_si; --c) {
  ------------------
  |  Branch (91:24): [True: 30.4M, False: 11.9M]
  ------------------
   92|  30.4M|    const int pos = scan[c];
   93|  30.4M|    const int coeff_ctx =
   94|  30.4M|        get_lower_levels_ctx(levels, pos, bhl, tx_size, tx_class);
   95|  30.4M|    const int nsymbs = 4;
   96|  30.4M|    int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR);
  ------------------
  |  |   51|  30.4M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   97|  30.4M|    if (level > NUM_BASE_LEVELS) {
  ------------------
  |  |   46|  30.4M|#define NUM_BASE_LEVELS 2
  ------------------
  |  Branch (97:9): [True: 4.94M, False: 25.5M]
  ------------------
   98|  4.94M|      const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
   99|  4.94M|      aom_cdf_prob *cdf = br_cdf[br_ctx];
  100|  8.13M|      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   49|  8.13M|#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
  |  |  ------------------
  |  |  |  |   48|  8.13M|#define BR_CDF_SIZE (4)
  |  |  ------------------
  ------------------
                    for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
  ------------------
  |  |   48|  3.19M|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (100:25): [True: 7.82M, False: 311k]
  ------------------
  101|  7.82M|        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
  ------------------
  |  |   51|  7.82M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
  102|  7.82M|        level += k;
  103|  7.82M|        if (k < BR_CDF_SIZE - 1) break;
  ------------------
  |  |   48|  7.82M|#define BR_CDF_SIZE (4)
  ------------------
  |  Branch (103:13): [True: 4.63M, False: 3.19M]
  ------------------
  104|  7.82M|      }
  105|  4.94M|    }
  106|  30.4M|    levels[get_padded_idx(pos, bhl)] = level;
  107|  30.4M|  }
  108|  11.9M|}
decodetxb.c:read_golomb:
   22|   902k|static int read_golomb(MACROBLOCKD *xd, aom_reader *r) {
   23|   902k|  int x = 1;
   24|   902k|  int length = 0;
   25|   902k|  int i = 0;
   26|       |
   27|  2.83M|  while (!i) {
  ------------------
  |  Branch (27:10): [True: 1.93M, False: 901k]
  ------------------
   28|  1.93M|    i = aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  1.93M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   29|  1.93M|    ++length;
   30|  1.93M|    if (length > 20) {
  ------------------
  |  Branch (30:9): [True: 430, False: 1.93M]
  ------------------
   31|    430|      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
   32|    430|                         "Invalid length in read_golomb");
   33|    430|      break;
   34|    430|    }
   35|  1.93M|  }
   36|       |
   37|  1.93M|  for (i = 0; i < length - 1; ++i) {
  ------------------
  |  Branch (37:15): [True: 1.02M, False: 902k]
  ------------------
   38|  1.02M|    x <<= 1;
   39|  1.02M|    x += aom_read_bit(r, ACCT_STR);
  ------------------
  |  |   43|  1.02M|  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   40|  1.02M|  }
   41|       |
   42|   902k|  return x - 1;
   43|   902k|}
decodetxb.c:get_dqv:
   54|   102M|                          const qm_val_t *iqmatrix) {
   55|   102M|  int dqv = dequant[!!coeff_idx];
   56|   102M|  if (iqmatrix != NULL)
  ------------------
  |  Branch (56:7): [True: 2.09M, False: 100M]
  ------------------
   57|  2.09M|    dqv =
   58|  2.09M|        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
  ------------------
  |  |   62|  2.09M|#define AOM_QM_BITS 5
  ------------------
                      ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
  ------------------
  |  |   62|  2.09M|#define AOM_QM_BITS 5
  ------------------
   59|   102M|  return dqv;
   60|   102M|}

av1_decode_palette_tokens:
   66|   161k|                               aom_reader *r) {
   67|   161k|  assert(plane == 0 || plane == 1);
   68|   161k|  Av1ColorMapParam params;
   69|   161k|  params.color_map =
   70|   161k|      xd->plane[plane].color_index_map + xd->color_index_map_offset[plane];
   71|   161k|  params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
  ------------------
  |  Branch (71:20): [True: 73.1k, False: 88.6k]
  ------------------
   72|   161k|                         : xd->tile_ctx->palette_y_color_index_cdf;
   73|   161k|  const MB_MODE_INFO *const mbmi = xd->mi[0];
   74|   161k|  params.n_colors = mbmi->palette_mode_info.palette_size[plane];
   75|   161k|  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
   76|   161k|                           &params.plane_height, &params.rows, &params.cols);
   77|   161k|  decode_color_map_tokens(&params, r);
   78|   161k|}
detokenize.c:decode_color_map_tokens:
   25|   161k|static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) {
   26|   161k|  uint8_t color_order[PALETTE_MAX_SIZE];
   27|   161k|  const int n = param->n_colors;
   28|   161k|  uint8_t *const color_map = param->color_map;
   29|   161k|  MapCdf color_map_cdf = param->map_cdf;
   30|   161k|  int plane_block_width = param->plane_width;
   31|   161k|  int plane_block_height = param->plane_height;
   32|   161k|  int rows = param->rows;
   33|   161k|  int cols = param->cols;
   34|       |
   35|       |  // The first color index.
   36|   161k|  color_map[0] = av1_read_uniform(r, n);
   37|   161k|  assert(color_map[0] < n);
   38|       |
   39|       |  // Run wavefront on the palette map index decoding.
   40|  3.88M|  for (int i = 1; i < rows + cols - 1; ++i) {
  ------------------
  |  Branch (40:19): [True: 3.72M, False: 156k]
  ------------------
   41|  29.9M|    for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) {
  ------------------
  |  |   34|  3.72M|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.83M, False: 1.89M]
  |  |  ------------------
  ------------------
                  for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) {
  ------------------
  |  |   35|  29.9M|#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (35:23): [True: 12.8M, False: 17.1M]
  |  |  ------------------
  ------------------
  |  Branch (41:39): [True: 26.2M, False: 3.72M]
  ------------------
   42|  26.2M|      const int color_ctx = av1_get_palette_color_index_context(
   43|  26.2M|          color_map, plane_block_width, (i - j), j, n, color_order, NULL);
   44|  26.2M|      const int color_idx = aom_read_symbol(
  ------------------
  |  |   51|  26.2M|  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
  ------------------
   45|  26.2M|          r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR);
   46|  26.2M|      assert(color_idx >= 0 && color_idx < n);
   47|  26.2M|      color_map[(i - j) * plane_block_width + j] = color_order[color_idx];
   48|  26.2M|    }
   49|  3.72M|  }
   50|       |  // Copy last column to extra columns.
   51|   156k|  if (cols < plane_block_width) {
  ------------------
  |  Branch (51:7): [True: 288, False: 156k]
  ------------------
   52|  9.31k|    for (int i = 0; i < rows; ++i) {
  ------------------
  |  Branch (52:21): [True: 9.02k, False: 288]
  ------------------
   53|  9.02k|      memset(color_map + i * plane_block_width + cols,
   54|  9.02k|             color_map[i * plane_block_width + cols - 1],
   55|  9.02k|             (plane_block_width - cols));
   56|  9.02k|    }
   57|    288|  }
   58|       |  // Copy last row to extra rows.
   59|   167k|  for (int i = rows; i < plane_block_height; ++i) {
  ------------------
  |  Branch (59:22): [True: 11.4k, False: 156k]
  ------------------
   60|  11.4k|    memcpy(color_map + i * plane_block_width,
   61|  11.4k|           color_map + (rows - 1) * plane_block_width, plane_block_width);
   62|  11.4k|  }
   63|   156k|}

av1_add_film_grain:
 1373|  14.6k|                       aom_image_t *dst) {
 1374|  14.6k|  uint8_t *luma, *cb, *cr;
 1375|  14.6k|  int height, width, luma_stride, chroma_stride;
 1376|  14.6k|  int use_high_bit_depth = 0;
 1377|  14.6k|  int chroma_subsamp_x = 0;
 1378|  14.6k|  int chroma_subsamp_y = 0;
 1379|  14.6k|  int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 1 : 0;
  ------------------
  |  Branch (1379:21): [True: 96, False: 14.5k]
  ------------------
 1380|       |
 1381|  14.6k|  switch (src->fmt) {
 1382|      0|    case AOM_IMG_FMT_AOMI420:
  ------------------
  |  Branch (1382:5): [True: 0, False: 14.6k]
  ------------------
 1383|  1.71k|    case AOM_IMG_FMT_I420:
  ------------------
  |  Branch (1383:5): [True: 1.71k, False: 12.9k]
  ------------------
 1384|  1.71k|      use_high_bit_depth = 0;
 1385|  1.71k|      chroma_subsamp_x = 1;
 1386|  1.71k|      chroma_subsamp_y = 1;
 1387|  1.71k|      break;
 1388|  5.41k|    case AOM_IMG_FMT_I42016:
  ------------------
  |  Branch (1388:5): [True: 5.41k, False: 9.21k]
  ------------------
 1389|  5.41k|      use_high_bit_depth = 1;
 1390|  5.41k|      chroma_subsamp_x = 1;
 1391|  5.41k|      chroma_subsamp_y = 1;
 1392|  5.41k|      break;
 1393|       |      //    case AOM_IMG_FMT_444A:
 1394|  1.04k|    case AOM_IMG_FMT_I444:
  ------------------
  |  Branch (1394:5): [True: 1.04k, False: 13.5k]
  ------------------
 1395|  1.04k|      use_high_bit_depth = 0;
 1396|  1.04k|      chroma_subsamp_x = 0;
 1397|  1.04k|      chroma_subsamp_y = 0;
 1398|  1.04k|      break;
 1399|  4.12k|    case AOM_IMG_FMT_I44416:
  ------------------
  |  Branch (1399:5): [True: 4.12k, False: 10.5k]
  ------------------
 1400|  4.12k|      use_high_bit_depth = 1;
 1401|  4.12k|      chroma_subsamp_x = 0;
 1402|  4.12k|      chroma_subsamp_y = 0;
 1403|  4.12k|      break;
 1404|  2.18k|    case AOM_IMG_FMT_I422:
  ------------------
  |  Branch (1404:5): [True: 2.18k, False: 12.4k]
  ------------------
 1405|  2.18k|      use_high_bit_depth = 0;
 1406|  2.18k|      chroma_subsamp_x = 1;
 1407|  2.18k|      chroma_subsamp_y = 0;
 1408|  2.18k|      break;
 1409|    140|    case AOM_IMG_FMT_I42216:
  ------------------
  |  Branch (1409:5): [True: 140, False: 14.4k]
  ------------------
 1410|    140|      use_high_bit_depth = 1;
 1411|    140|      chroma_subsamp_x = 1;
 1412|    140|      chroma_subsamp_y = 0;
 1413|    140|      break;
 1414|      0|    default:  // unknown input format
  ------------------
  |  Branch (1414:5): [True: 0, False: 14.6k]
  ------------------
 1415|      0|      fprintf(stderr, "Film grain error: input format is not supported!");
 1416|      0|      return -1;
 1417|  14.6k|  }
 1418|       |
 1419|  14.6k|  assert(params->bit_depth == src->bit_depth);
 1420|       |
 1421|  14.6k|  dst->fmt = src->fmt;
 1422|  14.6k|  dst->bit_depth = src->bit_depth;
 1423|       |
 1424|  14.6k|  dst->r_w = src->r_w;
 1425|  14.6k|  dst->r_h = src->r_h;
 1426|  14.6k|  dst->d_w = src->d_w;
 1427|  14.6k|  dst->d_h = src->d_h;
 1428|       |
 1429|  14.6k|  dst->cp = src->cp;
 1430|  14.6k|  dst->tc = src->tc;
 1431|  14.6k|  dst->mc = src->mc;
 1432|       |
 1433|  14.6k|  dst->monochrome = src->monochrome;
 1434|  14.6k|  dst->csp = src->csp;
 1435|  14.6k|  dst->range = src->range;
 1436|       |
 1437|  14.6k|  dst->x_chroma_shift = src->x_chroma_shift;
 1438|  14.6k|  dst->y_chroma_shift = src->y_chroma_shift;
 1439|       |
 1440|  14.6k|  dst->temporal_id = src->temporal_id;
 1441|  14.6k|  dst->spatial_id = src->spatial_id;
 1442|       |
 1443|  14.6k|  width = src->d_w % 2 ? src->d_w + 1 : src->d_w;
  ------------------
  |  Branch (1443:11): [True: 10.8k, False: 3.76k]
  ------------------
 1444|  14.6k|  height = src->d_h % 2 ? src->d_h + 1 : src->d_h;
  ------------------
  |  Branch (1444:12): [True: 2.60k, False: 12.0k]
  ------------------
 1445|       |
 1446|  14.6k|  copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
                copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
 1447|  14.6k|            dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
                          dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
 1448|  14.6k|            src->d_h, use_high_bit_depth);
 1449|       |  // Note that dst is already assumed to be aligned to even.
 1450|  14.6k|  extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
                extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
 1451|  14.6k|              src->d_h, use_high_bit_depth);
 1452|       |
 1453|  14.6k|  if (!src->monochrome) {
  ------------------
  |  Branch (1453:7): [True: 14.5k, False: 41]
  ------------------
 1454|  14.5k|    copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
  ------------------
  |  |  227|  14.5k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
                  copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
  ------------------
  |  |  227|  14.5k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
 1455|  14.5k|              dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U],
  ------------------
  |  |  227|  14.5k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
                            dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U],
  ------------------
  |  |  227|  14.5k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
 1456|  14.5k|              width >> chroma_subsamp_x, height >> chroma_subsamp_y,
 1457|  14.5k|              use_high_bit_depth);
 1458|       |
 1459|  14.5k|    copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V],
  ------------------
  |  |  228|  14.5k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
                  copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V],
  ------------------
  |  |  228|  14.5k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
 1460|  14.5k|              dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V],
  ------------------
  |  |  228|  14.5k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
                            dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V],
  ------------------
  |  |  228|  14.5k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
 1461|  14.5k|              width >> chroma_subsamp_x, height >> chroma_subsamp_y,
 1462|  14.5k|              use_high_bit_depth);
 1463|  14.5k|  }
 1464|       |
 1465|  14.6k|  luma = dst->planes[AOM_PLANE_Y];
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
 1466|  14.6k|  cb = dst->planes[AOM_PLANE_U];
  ------------------
  |  |  227|  14.6k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
 1467|  14.6k|  cr = dst->planes[AOM_PLANE_V];
  ------------------
  |  |  228|  14.6k|#define AOM_PLANE_V 2      /**< V (Chroma) plane */
  ------------------
 1468|       |
 1469|       |  // luma and chroma strides in samples
 1470|  14.6k|  luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
  ------------------
  |  |  226|  14.6k|#define AOM_PLANE_Y 0      /**< Y (Luminance) plane */
  ------------------
 1471|  14.6k|  chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
  ------------------
  |  |  227|  14.6k|#define AOM_PLANE_U 1      /**< U (Chroma) plane */
  ------------------
 1472|       |
 1473|  14.6k|  return add_film_grain_run(params, luma, cb, cr, height, width, luma_stride,
 1474|  14.6k|                            chroma_stride, use_high_bit_depth, chroma_subsamp_y,
 1475|  14.6k|                            chroma_subsamp_x, mc_identity);
 1476|  14.6k|}
grain_synthesis.c:copy_rect:
  862|  43.8k|                      int use_high_bit_depth) {
  863|  43.8k|  int hbd_coeff = use_high_bit_depth ? 2 : 1;
  ------------------
  |  Branch (863:19): [True: 28.9k, False: 14.8k]
  ------------------
  864|   939k|  while (height) {
  ------------------
  |  Branch (864:10): [True: 895k, False: 43.8k]
  ------------------
  865|   895k|    memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff);
  866|   895k|    src += src_stride;
  867|   895k|    dst += dst_stride;
  868|   895k|    --height;
  869|   895k|  }
  870|  43.8k|  return;
  871|  43.8k|}
grain_synthesis.c:extend_even:
  885|  14.6k|                        int use_high_bit_depth) {
  886|  14.6k|  if ((width & 1) == 0 && (height & 1) == 0) return;
  ------------------
  |  Branch (886:7): [True: 3.76k, False: 10.8k]
  |  Branch (886:27): [True: 1.47k, False: 2.29k]
  ------------------
  887|  13.1k|  if (use_high_bit_depth) {
  ------------------
  |  Branch (887:7): [True: 9.28k, False: 3.87k]
  ------------------
  888|  9.28k|    uint16_t *dst16 = (uint16_t *)dst;
  889|  9.28k|    int dst16_stride = dst_stride / 2;
  890|  9.28k|    if (width & 1) {
  ------------------
  |  Branch (890:9): [True: 9.16k, False: 112]
  ------------------
  891|   195k|      for (int i = 0; i < height; ++i)
  ------------------
  |  Branch (891:23): [True: 186k, False: 9.16k]
  ------------------
  892|   186k|        dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1];
  893|  9.16k|    }
  894|  9.28k|    width = (width + 1) & (~1);
  895|  9.28k|    if (height & 1) {
  ------------------
  |  Branch (895:9): [True: 153, False: 9.12k]
  ------------------
  896|    153|      memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride],
  897|    153|             sizeof(*dst16) * width);
  898|    153|    }
  899|  9.28k|  } else {
  900|  3.87k|    if (width & 1) {
  ------------------
  |  Branch (900:9): [True: 1.69k, False: 2.17k]
  ------------------
  901|  50.5k|      for (int i = 0; i < height; ++i)
  ------------------
  |  Branch (901:23): [True: 48.8k, False: 1.69k]
  ------------------
  902|  48.8k|        dst[i * dst_stride + width] = dst[i * dst_stride + width - 1];
  903|  1.69k|    }
  904|  3.87k|    width = (width + 1) & (~1);
  905|  3.87k|    if (height & 1) {
  ------------------
  |  Branch (905:9): [True: 2.45k, False: 1.41k]
  ------------------
  906|  2.45k|      memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride],
  907|  2.45k|             sizeof(*dst) * width);
  908|  2.45k|    }
  909|  3.87k|  }
  910|  13.1k|}
grain_synthesis.c:add_film_grain_run:
  991|  14.6k|                              int chroma_subsamp_x, int mc_identity) {
  992|  14.6k|  int **pred_pos_luma;
  993|  14.6k|  int **pred_pos_chroma;
  994|  14.6k|  int *luma_grain_block;
  995|  14.6k|  int *cb_grain_block;
  996|  14.6k|  int *cr_grain_block;
  997|       |
  998|  14.6k|  int *y_line_buf;
  999|  14.6k|  int *cb_line_buf;
 1000|  14.6k|  int *cr_line_buf;
 1001|       |
 1002|  14.6k|  int *y_col_buf;
 1003|  14.6k|  int *cb_col_buf;
 1004|  14.6k|  int *cr_col_buf;
 1005|       |
 1006|  14.6k|  random_register = params->random_seed;
 1007|       |
 1008|  14.6k|  int left_pad = 3;
 1009|  14.6k|  int right_pad = 3;  // padding to offset for AR coefficients
 1010|  14.6k|  int top_pad = 3;
 1011|  14.6k|  int bottom_pad = 0;
 1012|       |
 1013|  14.6k|  int ar_padding = 3;  // maximum lag used for stabilization of AR coefficients
 1014|       |
 1015|  14.6k|  luma_subblock_size_y = 32;
 1016|  14.6k|  luma_subblock_size_x = 32;
 1017|       |
 1018|  14.6k|  chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y;
 1019|  14.6k|  chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x;
 1020|       |
 1021|       |  // Initial padding is only needed for generation of
 1022|       |  // film grain templates (to stabilize the AR process)
 1023|       |  // Only a 64x64 luma and 32x32 chroma part of a template
 1024|       |  // is used later for adding grain, padding can be discarded
 1025|       |
 1026|  14.6k|  int luma_block_size_y =
 1027|  14.6k|      top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad;
 1028|  14.6k|  int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 +
 1029|  14.6k|                          2 * ar_padding + right_pad;
 1030|       |
 1031|  14.6k|  int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
 1032|  14.6k|                            chroma_subblock_size_y * 2 + bottom_pad;
 1033|  14.6k|  int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
 1034|  14.6k|                            chroma_subblock_size_x * 2 +
 1035|  14.6k|                            (2 >> chroma_subsamp_x) * ar_padding + right_pad;
 1036|       |
 1037|  14.6k|  int luma_grain_stride = luma_block_size_x;
 1038|  14.6k|  int chroma_grain_stride = chroma_block_size_x;
 1039|       |
 1040|  14.6k|  int overlap = params->overlap_flag;
 1041|  14.6k|  int bit_depth = params->bit_depth;
 1042|       |
 1043|  14.6k|  const int grain_center = 128 << (bit_depth - 8);
 1044|  14.6k|  grain_min = 0 - grain_center;
 1045|  14.6k|  grain_max = grain_center - 1;
 1046|       |
 1047|  14.6k|  if (!init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
  ------------------
  |  Branch (1047:7): [True: 0, False: 14.6k]
  ------------------
 1048|  14.6k|                   &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
 1049|  14.6k|                   &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf,
 1050|  14.6k|                   &y_col_buf, &cb_col_buf, &cr_col_buf,
 1051|  14.6k|                   luma_block_size_y * luma_block_size_x,
 1052|  14.6k|                   chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
 1053|  14.6k|                   chroma_subsamp_x))
 1054|      0|    return -1;
 1055|       |
 1056|  14.6k|  generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
 1057|  14.6k|                            luma_block_size_y, luma_block_size_x,
 1058|  14.6k|                            luma_grain_stride, left_pad, top_pad, right_pad,
 1059|  14.6k|                            bottom_pad);
 1060|       |
 1061|  14.6k|  if (!generate_chroma_grain_blocks(
  ------------------
  |  Branch (1061:7): [True: 0, False: 14.6k]
  ------------------
 1062|  14.6k|          params, pred_pos_chroma, luma_grain_block, cb_grain_block,
 1063|  14.6k|          cr_grain_block, luma_grain_stride, chroma_block_size_y,
 1064|  14.6k|          chroma_block_size_x, chroma_grain_stride, left_pad, top_pad,
 1065|  14.6k|          right_pad, bottom_pad, chroma_subsamp_y, chroma_subsamp_x))
 1066|      0|    return -1;
 1067|       |
 1068|  14.6k|  init_scaling_function(params->scaling_points_y, params->num_y_points,
 1069|  14.6k|                        scaling_lut_y);
 1070|       |
 1071|  14.6k|  if (params->chroma_scaling_from_luma) {
  ------------------
  |  Branch (1071:7): [True: 8.35k, False: 6.28k]
  ------------------
 1072|  8.35k|    memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
 1073|  8.35k|    memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
 1074|  8.35k|  } else {
 1075|  6.28k|    init_scaling_function(params->scaling_points_cb, params->num_cb_points,
 1076|  6.28k|                          scaling_lut_cb);
 1077|  6.28k|    init_scaling_function(params->scaling_points_cr, params->num_cr_points,
 1078|  6.28k|                          scaling_lut_cr);
 1079|  6.28k|  }
 1080|  37.1k|  for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) {
  ------------------
  |  Branch (1080:19): [True: 22.4k, False: 14.6k]
  ------------------
 1081|  22.4k|    init_random_generator(y * 2, params->random_seed);
 1082|       |
 1083|  75.5k|    for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) {
  ------------------
  |  Branch (1083:21): [True: 53.0k, False: 22.4k]
  ------------------
 1084|  53.0k|      int offset_y = get_random_number(8);
 1085|  53.0k|      int offset_x = (offset_y >> 4) & 15;
 1086|  53.0k|      offset_y &= 15;
 1087|       |
 1088|  53.0k|      int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1);
 1089|  53.0k|      int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1);
 1090|       |
 1091|  53.0k|      int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
 1092|  53.0k|                            offset_y * (2 >> chroma_subsamp_y);
 1093|  53.0k|      int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
 1094|  53.0k|                            offset_x * (2 >> chroma_subsamp_x);
 1095|       |
 1096|  53.0k|      if (overlap && x) {
  ------------------
  |  Branch (1096:11): [True: 36.8k, False: 16.2k]
  |  Branch (1096:22): [True: 20.7k, False: 16.0k]
  ------------------
 1097|  20.7k|        ver_boundary_overlap(
 1098|  20.7k|            y_col_buf, 2,
 1099|  20.7k|            luma_grain_block + luma_offset_y * luma_grain_stride +
 1100|  20.7k|                luma_offset_x,
 1101|  20.7k|            luma_grain_stride, y_col_buf, 2, 2,
 1102|  20.7k|            AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
  ------------------
  |  |   34|  20.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 11.3k, False: 9.37k]
  |  |  ------------------
  ------------------
 1103|       |
 1104|  20.7k|        ver_boundary_overlap(
 1105|  20.7k|            cb_col_buf, 2 >> chroma_subsamp_x,
 1106|  20.7k|            cb_grain_block + chroma_offset_y * chroma_grain_stride +
 1107|  20.7k|                chroma_offset_x,
 1108|  20.7k|            chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
 1109|  20.7k|            2 >> chroma_subsamp_x,
 1110|  20.7k|            AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
  ------------------
  |  |   34|  20.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 11.3k, False: 9.37k]
  |  |  ------------------
  ------------------
 1111|  20.7k|                   (height - (y << 1)) >> chroma_subsamp_y));
 1112|       |
 1113|  20.7k|        ver_boundary_overlap(
 1114|  20.7k|            cr_col_buf, 2 >> chroma_subsamp_x,
 1115|  20.7k|            cr_grain_block + chroma_offset_y * chroma_grain_stride +
 1116|  20.7k|                chroma_offset_x,
 1117|  20.7k|            chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
 1118|  20.7k|            2 >> chroma_subsamp_x,
 1119|  20.7k|            AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
  ------------------
  |  |   34|  20.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 11.3k, False: 9.37k]
  |  |  ------------------
  ------------------
 1120|  20.7k|                   (height - (y << 1)) >> chroma_subsamp_y));
 1121|       |
 1122|  20.7k|        int i = y ? 1 : 0;
  ------------------
  |  Branch (1122:17): [True: 12.0k, False: 8.71k]
  ------------------
 1123|       |
 1124|  20.7k|        if (use_high_bit_depth) {
  ------------------
  |  Branch (1124:13): [True: 17.2k, False: 3.48k]
  ------------------
 1125|  17.2k|          add_noise_to_block_hbd(
 1126|  17.2k|              params,
 1127|  17.2k|              (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1),
 1128|  17.2k|              (uint16_t *)cb +
 1129|  17.2k|                  ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
 1130|  17.2k|                  (x << (1 - chroma_subsamp_x)),
 1131|  17.2k|              (uint16_t *)cr +
 1132|  17.2k|                  ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
 1133|  17.2k|                  (x << (1 - chroma_subsamp_x)),
 1134|  17.2k|              luma_stride, chroma_stride, y_col_buf + i * 4,
 1135|  17.2k|              cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
 1136|  17.2k|              cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
 1137|  17.2k|              2, (2 - chroma_subsamp_x),
 1138|  17.2k|              AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
  ------------------
  |  |   34|  17.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 10.2k, False: 7.01k]
  |  |  ------------------
  ------------------
 1139|  17.2k|              bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
 1140|  17.2k|        } else {
 1141|  3.48k|          add_noise_to_block(
 1142|  3.48k|              params, luma + ((y + i) << 1) * luma_stride + (x << 1),
 1143|  3.48k|              cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
 1144|  3.48k|                  (x << (1 - chroma_subsamp_x)),
 1145|  3.48k|              cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
 1146|  3.48k|                  (x << (1 - chroma_subsamp_x)),
 1147|  3.48k|              luma_stride, chroma_stride, y_col_buf + i * 4,
 1148|  3.48k|              cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
 1149|  3.48k|              cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
 1150|  3.48k|              2, (2 - chroma_subsamp_x),
 1151|  3.48k|              AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
  ------------------
  |  |   34|  3.48k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.78k, False: 1.69k]
  |  |  ------------------
  ------------------
 1152|  3.48k|              bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
 1153|  3.48k|        }
 1154|  20.7k|      }
 1155|       |
 1156|  53.0k|      if (overlap && y) {
  ------------------
  |  Branch (1156:11): [True: 36.8k, False: 16.2k]
  |  Branch (1156:22): [True: 19.1k, False: 17.6k]
  ------------------
 1157|  19.1k|        if (x) {
  ------------------
  |  Branch (1157:13): [True: 12.0k, False: 7.12k]
  ------------------
 1158|  12.0k|          hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2,
 1159|  12.0k|                               y_line_buf + (x << 1), luma_stride, 2, 2);
 1160|       |
 1161|  12.0k|          hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x),
 1162|  12.0k|                               chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x,
 1163|  12.0k|                               cb_line_buf + x * (2 >> chroma_subsamp_x),
 1164|  12.0k|                               chroma_stride, 2 >> chroma_subsamp_x,
 1165|  12.0k|                               2 >> chroma_subsamp_y);
 1166|       |
 1167|  12.0k|          hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x),
 1168|  12.0k|                               chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x,
 1169|  12.0k|                               cr_line_buf + x * (2 >> chroma_subsamp_x),
 1170|  12.0k|                               chroma_stride, 2 >> chroma_subsamp_x,
 1171|  12.0k|                               2 >> chroma_subsamp_y);
 1172|  12.0k|        }
 1173|       |
 1174|  19.1k|        hor_boundary_overlap(
 1175|  19.1k|            y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
  ------------------
  |  Branch (1175:28): [True: 12.0k, False: 7.12k]
  ------------------
 1176|  19.1k|            luma_grain_block + luma_offset_y * luma_grain_stride +
 1177|  19.1k|                luma_offset_x + (x ? 2 : 0),
  ------------------
  |  Branch (1177:34): [True: 12.0k, False: 7.12k]
  ------------------
 1178|  19.1k|            luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
  ------------------
  |  Branch (1178:47): [True: 12.0k, False: 7.12k]
  ------------------
 1179|  19.1k|            AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1),
  ------------------
  |  |   34|  38.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 12.0k, False: 7.12k]
  |  |  |  Branch (34:25): [True: 12.0k, False: 7.12k]
  |  |  |  Branch (34:31): [True: 12.0k, False: 7.12k]
  |  |  |  Branch (34:38): [True: 6.86k, False: 5.18k]
  |  |  |  Branch (34:44): [True: 5.18k, False: 1.94k]
  |  |  ------------------
  ------------------
 1180|  19.1k|                   width - ((x ? x + 1 : 0) << 1)),
 1181|  19.1k|            2);
 1182|       |
 1183|  19.1k|        hor_boundary_overlap(
 1184|  19.1k|            cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
  ------------------
  |  Branch (1184:29): [True: 12.0k, False: 7.12k]
  ------------------
 1185|  19.1k|            chroma_stride,
 1186|  19.1k|            cb_grain_block + chroma_offset_y * chroma_grain_stride +
 1187|  19.1k|                chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
  ------------------
  |  Branch (1187:37): [True: 12.0k, False: 7.12k]
  ------------------
 1188|  19.1k|            chroma_grain_stride,
 1189|  19.1k|            cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
  ------------------
  |  Branch (1189:29): [True: 12.0k, False: 7.12k]
  ------------------
 1190|  19.1k|            chroma_stride,
 1191|  19.1k|            AOMMIN(chroma_subblock_size_x -
  ------------------
  |  |   34|  38.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 12.0k, False: 7.12k]
  |  |  |  Branch (34:25): [True: 12.0k, False: 7.12k]
  |  |  |  Branch (34:31): [True: 12.0k, False: 7.12k]
  |  |  |  Branch (34:38): [True: 6.86k, False: 5.18k]
  |  |  |  Branch (34:44): [True: 5.18k, False: 1.94k]
  |  |  ------------------
  ------------------
 1192|  19.1k|                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
 1193|  19.1k|                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
 1194|  19.1k|            2 >> chroma_subsamp_y);
 1195|       |
 1196|  19.1k|        hor_boundary_overlap(
 1197|  19.1k|            cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
  ------------------
  |  Branch (1197:29): [True: 12.0k, False: 7.12k]
  ------------------
 1198|  19.1k|            chroma_stride,
 1199|  19.1k|            cr_grain_block + chroma_offset_y * chroma_grain_stride +
 1200|  19.1k|                chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
  ------------------
  |  Branch (1200:37): [True: 12.0k, False: 7.12k]
  ------------------
 1201|  19.1k|            chroma_grain_stride,
 1202|  19.1k|            cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
  ------------------
  |  Branch (1202:29): [True: 12.0k, False: 7.12k]
  ------------------
 1203|  19.1k|            chroma_stride,
 1204|  19.1k|            AOMMIN(chroma_subblock_size_x -
  ------------------
  |  |   34|  38.3k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 12.0k, False: 7.12k]
  |  |  |  Branch (34:25): [True: 12.0k, False: 7.12k]
  |  |  |  Branch (34:31): [True: 12.0k, False: 7.12k]
  |  |  |  Branch (34:38): [True: 6.86k, False: 5.18k]
  |  |  |  Branch (34:44): [True: 5.18k, False: 1.94k]
  |  |  ------------------
  ------------------
 1205|  19.1k|                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
 1206|  19.1k|                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
 1207|  19.1k|            2 >> chroma_subsamp_y);
 1208|       |
 1209|  19.1k|        if (use_high_bit_depth) {
  ------------------
  |  Branch (1209:13): [True: 14.7k, False: 4.45k]
  ------------------
 1210|  14.7k|          add_noise_to_block_hbd(
 1211|  14.7k|              params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1),
 1212|  14.7k|              (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
 1213|  14.7k|                  (x << ((1 - chroma_subsamp_x))),
 1214|  14.7k|              (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
 1215|  14.7k|                  (x << ((1 - chroma_subsamp_x))),
 1216|  14.7k|              luma_stride, chroma_stride, y_line_buf + (x << 1),
 1217|  14.7k|              cb_line_buf + (x << (1 - chroma_subsamp_x)),
 1218|  14.7k|              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
 1219|  14.7k|              chroma_stride, 1,
 1220|  14.7k|              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
  ------------------
  |  |   34|  14.7k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 10.2k, False: 4.45k]
  |  |  ------------------
  ------------------
 1221|  14.7k|              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
 1222|  14.7k|        } else {
 1223|  4.45k|          add_noise_to_block(
 1224|  4.45k|              params, luma + (y << 1) * luma_stride + (x << 1),
 1225|  4.45k|              cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
 1226|  4.45k|                  (x << ((1 - chroma_subsamp_x))),
 1227|  4.45k|              cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
 1228|  4.45k|                  (x << ((1 - chroma_subsamp_x))),
 1229|  4.45k|              luma_stride, chroma_stride, y_line_buf + (x << 1),
 1230|  4.45k|              cb_line_buf + (x << (1 - chroma_subsamp_x)),
 1231|  4.45k|              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
 1232|  4.45k|              chroma_stride, 1,
 1233|  4.45k|              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
  ------------------
  |  |   34|  4.45k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 1.78k, False: 2.67k]
  |  |  ------------------
  ------------------
 1234|  4.45k|              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
 1235|  4.45k|        }
 1236|  19.1k|      }
 1237|       |
 1238|  53.0k|      int i = overlap && y ? 1 : 0;
  ------------------
  |  Branch (1238:15): [True: 36.8k, False: 16.2k]
  |  Branch (1238:26): [True: 19.1k, False: 17.6k]
  ------------------
 1239|  53.0k|      int j = overlap && x ? 1 : 0;
  ------------------
  |  Branch (1239:15): [True: 36.8k, False: 16.2k]
  |  Branch (1239:26): [True: 20.7k, False: 16.0k]
  ------------------
 1240|       |
 1241|  53.0k|      if (use_high_bit_depth) {
  ------------------
  |  Branch (1241:11): [True: 37.2k, False: 15.8k]
  ------------------
 1242|  37.2k|        add_noise_to_block_hbd(
 1243|  37.2k|            params,
 1244|  37.2k|            (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
 1245|  37.2k|            (uint16_t *)cb +
 1246|  37.2k|                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
 1247|  37.2k|                ((x + j) << (1 - chroma_subsamp_x)),
 1248|  37.2k|            (uint16_t *)cr +
 1249|  37.2k|                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
 1250|  37.2k|                ((x + j) << (1 - chroma_subsamp_x)),
 1251|  37.2k|            luma_stride, chroma_stride,
 1252|  37.2k|            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
 1253|  37.2k|                luma_offset_x + (j << 1),
 1254|  37.2k|            cb_grain_block +
 1255|  37.2k|                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
 1256|  37.2k|                    chroma_grain_stride +
 1257|  37.2k|                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
 1258|  37.2k|            cr_grain_block +
 1259|  37.2k|                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
 1260|  37.2k|                    chroma_grain_stride +
 1261|  37.2k|                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
 1262|  37.2k|            luma_grain_stride, chroma_grain_stride,
 1263|  37.2k|            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
  ------------------
  |  |   34|  37.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 17.0k, False: 20.2k]
  |  |  ------------------
  ------------------
 1264|  37.2k|            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
  ------------------
  |  |   34|  37.2k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 22.5k, False: 14.6k]
  |  |  ------------------
  ------------------
 1265|  37.2k|            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
 1266|  37.2k|      } else {
 1267|  15.8k|        add_noise_to_block(
 1268|  15.8k|            params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
 1269|  15.8k|            cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
 1270|  15.8k|                ((x + j) << (1 - chroma_subsamp_x)),
 1271|  15.8k|            cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
 1272|  15.8k|                ((x + j) << (1 - chroma_subsamp_x)),
 1273|  15.8k|            luma_stride, chroma_stride,
 1274|  15.8k|            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
 1275|  15.8k|                luma_offset_x + (j << 1),
 1276|  15.8k|            cb_grain_block +
 1277|  15.8k|                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
 1278|  15.8k|                    chroma_grain_stride +
 1279|  15.8k|                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
 1280|  15.8k|            cr_grain_block +
 1281|  15.8k|                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
 1282|  15.8k|                    chroma_grain_stride +
 1283|  15.8k|                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
 1284|  15.8k|            luma_grain_stride, chroma_grain_stride,
 1285|  15.8k|            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
  ------------------
  |  |   34|  15.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 6.99k, False: 8.84k]
  |  |  ------------------
  ------------------
 1286|  15.8k|            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
  ------------------
  |  |   34|  15.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 8.00k, False: 7.83k]
  |  |  ------------------
  ------------------
 1287|  15.8k|            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
 1288|  15.8k|      }
 1289|       |
 1290|  53.0k|      if (overlap) {
  ------------------
  |  Branch (1290:11): [True: 36.8k, False: 16.2k]
  ------------------
 1291|  36.8k|        if (x) {
  ------------------
  |  Branch (1291:13): [True: 20.7k, False: 16.0k]
  ------------------
 1292|       |          // Copy overlapped column buffer to line buffer
 1293|  20.7k|          copy_area(y_col_buf + (luma_subblock_size_y << 1), 2,
 1294|  20.7k|                    y_line_buf + (x << 1), luma_stride, 2, 2);
 1295|       |
 1296|  20.7k|          copy_area(
 1297|  20.7k|              cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
 1298|  20.7k|              2 >> chroma_subsamp_x,
 1299|  20.7k|              cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
 1300|  20.7k|              2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
 1301|       |
 1302|  20.7k|          copy_area(
 1303|  20.7k|              cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
 1304|  20.7k|              2 >> chroma_subsamp_x,
 1305|  20.7k|              cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
 1306|  20.7k|              2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
 1307|  20.7k|        }
 1308|       |
 1309|       |        // Copy grain to the line buffer for overlap with a bottom block
 1310|  36.8k|        copy_area(
 1311|  36.8k|            luma_grain_block +
 1312|  36.8k|                (luma_offset_y + luma_subblock_size_y) * luma_grain_stride +
 1313|  36.8k|                luma_offset_x + ((x ? 2 : 0)),
  ------------------
  |  Branch (1313:35): [True: 20.7k, False: 16.0k]
  ------------------
 1314|  36.8k|            luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
  ------------------
  |  Branch (1314:47): [True: 20.7k, False: 16.0k]
  ------------------
 1315|  36.8k|            AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2);
  ------------------
  |  |   34|  36.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 20.7k, False: 16.0k]
  |  |  ------------------
  ------------------
  |  Branch (1315:63): [True: 20.7k, False: 16.0k]
  ------------------
 1316|       |
 1317|  36.8k|        copy_area(cb_grain_block +
 1318|  36.8k|                      (chroma_offset_y + chroma_subblock_size_y) *
 1319|  36.8k|                          chroma_grain_stride +
 1320|  36.8k|                      chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
  ------------------
  |  Branch (1320:42): [True: 20.7k, False: 16.0k]
  ------------------
 1321|  36.8k|                  chroma_grain_stride,
 1322|  36.8k|                  cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
  ------------------
  |  Branch (1322:35): [True: 20.7k, False: 16.0k]
  ------------------
 1323|  36.8k|                  chroma_stride,
 1324|  36.8k|                  AOMMIN(chroma_subblock_size_x,
  ------------------
  |  |   34|  36.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 20.7k, False: 16.0k]
  |  |  ------------------
  ------------------
 1325|  36.8k|                         ((width - (x << 1)) >> chroma_subsamp_x)) -
 1326|  36.8k|                      (x ? 2 >> chroma_subsamp_x : 0),
  ------------------
  |  Branch (1326:24): [True: 20.7k, False: 16.0k]
  ------------------
 1327|  36.8k|                  2 >> chroma_subsamp_y);
 1328|       |
 1329|  36.8k|        copy_area(cr_grain_block +
 1330|  36.8k|                      (chroma_offset_y + chroma_subblock_size_y) *
 1331|  36.8k|                          chroma_grain_stride +
 1332|  36.8k|                      chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
  ------------------
  |  Branch (1332:42): [True: 20.7k, False: 16.0k]
  ------------------
 1333|  36.8k|                  chroma_grain_stride,
 1334|  36.8k|                  cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
  ------------------
  |  Branch (1334:35): [True: 20.7k, False: 16.0k]
  ------------------
 1335|  36.8k|                  chroma_stride,
 1336|  36.8k|                  AOMMIN(chroma_subblock_size_x,
  ------------------
  |  |   34|  36.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 20.7k, False: 16.0k]
  |  |  ------------------
  ------------------
 1337|  36.8k|                         ((width - (x << 1)) >> chroma_subsamp_x)) -
 1338|  36.8k|                      (x ? 2 >> chroma_subsamp_x : 0),
  ------------------
  |  Branch (1338:24): [True: 20.7k, False: 16.0k]
  ------------------
 1339|  36.8k|                  2 >> chroma_subsamp_y);
 1340|       |
 1341|       |        // Copy grain to the column buffer for overlap with the next block to
 1342|       |        // the right
 1343|       |
 1344|  36.8k|        copy_area(luma_grain_block + luma_offset_y * luma_grain_stride +
 1345|  36.8k|                      luma_offset_x + luma_subblock_size_x,
 1346|  36.8k|                  luma_grain_stride, y_col_buf, 2, 2,
 1347|  36.8k|                  AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
  ------------------
  |  |   34|  36.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 17.9k, False: 18.8k]
  |  |  ------------------
  ------------------
 1348|       |
 1349|  36.8k|        copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride +
 1350|  36.8k|                      chroma_offset_x + chroma_subblock_size_x,
 1351|  36.8k|                  chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
 1352|  36.8k|                  2 >> chroma_subsamp_x,
 1353|  36.8k|                  AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
  ------------------
  |  |   34|  36.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 17.9k, False: 18.8k]
  |  |  ------------------
  ------------------
 1354|  36.8k|                         (height - (y << 1)) >> chroma_subsamp_y));
 1355|       |
 1356|  36.8k|        copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride +
 1357|  36.8k|                      chroma_offset_x + chroma_subblock_size_x,
 1358|  36.8k|                  chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
 1359|  36.8k|                  2 >> chroma_subsamp_x,
 1360|  36.8k|                  AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
  ------------------
  |  |   34|  36.8k|#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
  |  |  ------------------
  |  |  |  Branch (34:23): [True: 17.9k, False: 18.8k]
  |  |  ------------------
  ------------------
 1361|  36.8k|                         (height - (y << 1)) >> chroma_subsamp_y));
 1362|  36.8k|      }
 1363|  53.0k|    }
 1364|  22.4k|  }
 1365|       |
 1366|  14.6k|  dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block,
 1367|  14.6k|                 &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf,
 1368|  14.6k|                 &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf);
 1369|  14.6k|  return 0;
 1370|  14.6k|}
grain_synthesis.c:init_arrays:
  303|  14.6k|                        int chroma_subsamp_y, int chroma_subsamp_x) {
  304|  14.6k|  *pred_pos_luma_p = NULL;
  305|  14.6k|  *pred_pos_chroma_p = NULL;
  306|  14.6k|  *luma_grain_block = NULL;
  307|  14.6k|  *cb_grain_block = NULL;
  308|  14.6k|  *cr_grain_block = NULL;
  309|  14.6k|  *y_line_buf = NULL;
  310|  14.6k|  *cb_line_buf = NULL;
  311|  14.6k|  *cr_line_buf = NULL;
  312|  14.6k|  *y_col_buf = NULL;
  313|  14.6k|  *cb_col_buf = NULL;
  314|  14.6k|  *cr_col_buf = NULL;
  315|       |
  316|  14.6k|  memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256);
  317|  14.6k|  memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256);
  318|  14.6k|  memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256);
  319|       |
  320|  14.6k|  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
  321|  14.6k|  int num_pos_chroma = num_pos_luma;
  322|  14.6k|  if (params->num_y_points > 0) ++num_pos_chroma;
  ------------------
  |  Branch (322:7): [True: 12.1k, False: 2.46k]
  ------------------
  323|       |
  324|  14.6k|  int **pred_pos_luma;
  325|  14.6k|  int **pred_pos_chroma;
  326|       |
  327|  14.6k|  pred_pos_luma = (int **)aom_calloc(num_pos_luma, sizeof(*pred_pos_luma));
  328|  14.6k|  if (!pred_pos_luma) return false;
  ------------------
  |  Branch (328:7): [True: 0, False: 14.6k]
  ------------------
  329|       |
  330|  44.1k|  for (int row = 0; row < num_pos_luma; row++) {
  ------------------
  |  Branch (330:21): [True: 29.4k, False: 14.6k]
  ------------------
  331|  29.4k|    pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3);
  332|  29.4k|    if (!pred_pos_luma[row]) {
  ------------------
  |  Branch (332:9): [True: 0, False: 29.4k]
  ------------------
  333|      0|      dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p,
  334|      0|                     luma_grain_block, cb_grain_block, cr_grain_block,
  335|      0|                     y_line_buf, cb_line_buf, cr_line_buf, y_col_buf,
  336|      0|                     cb_col_buf, cr_col_buf);
  337|      0|      return false;
  338|      0|    }
  339|  29.4k|  }
  340|       |
  341|  14.6k|  pred_pos_chroma =
  342|  14.6k|      (int **)aom_calloc(num_pos_chroma, sizeof(*pred_pos_chroma));
  343|  14.6k|  if (!pred_pos_chroma) {
  ------------------
  |  Branch (343:7): [True: 0, False: 14.6k]
  ------------------
  344|      0|    dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block,
  345|      0|                   cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf,
  346|      0|                   cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf);
  347|      0|    return false;
  348|      0|  }
  349|       |
  350|  56.2k|  for (int row = 0; row < num_pos_chroma; row++) {
  ------------------
  |  Branch (350:21): [True: 41.6k, False: 14.6k]
  ------------------
  351|  41.6k|    pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3);
  352|  41.6k|    if (!pred_pos_chroma[row]) {
  ------------------
  |  Branch (352:9): [True: 0, False: 41.6k]
  ------------------
  353|      0|      dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p,
  354|      0|                     luma_grain_block, cb_grain_block, cr_grain_block,
  355|      0|                     y_line_buf, cb_line_buf, cr_line_buf, y_col_buf,
  356|      0|                     cb_col_buf, cr_col_buf);
  357|      0|      return false;
  358|      0|    }
  359|  41.6k|  }
  360|       |
  361|  14.6k|  int pos_ar_index = 0;
  362|       |
  363|  21.5k|  for (int row = -params->ar_coeff_lag; row < 0; row++) {
  ------------------
  |  Branch (363:41): [True: 6.92k, False: 14.6k]
  ------------------
  364|  29.4k|    for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1;
  ------------------
  |  Branch (364:43): [True: 22.5k, False: 6.92k]
  ------------------
  365|  22.5k|         col++) {
  366|  22.5k|      pred_pos_luma[pos_ar_index][0] = row;
  367|  22.5k|      pred_pos_luma[pos_ar_index][1] = col;
  368|  22.5k|      pred_pos_luma[pos_ar_index][2] = 0;
  369|       |
  370|  22.5k|      pred_pos_chroma[pos_ar_index][0] = row;
  371|  22.5k|      pred_pos_chroma[pos_ar_index][1] = col;
  372|  22.5k|      pred_pos_chroma[pos_ar_index][2] = 0;
  373|  22.5k|      ++pos_ar_index;
  374|  22.5k|    }
  375|  6.92k|  }
  376|       |
  377|  21.5k|  for (int col = -params->ar_coeff_lag; col < 0; col++) {
  ------------------
  |  Branch (377:41): [True: 6.92k, False: 14.6k]
  ------------------
  378|  6.92k|    pred_pos_luma[pos_ar_index][0] = 0;
  379|  6.92k|    pred_pos_luma[pos_ar_index][1] = col;
  380|  6.92k|    pred_pos_luma[pos_ar_index][2] = 0;
  381|       |
  382|  6.92k|    pred_pos_chroma[pos_ar_index][0] = 0;
  383|  6.92k|    pred_pos_chroma[pos_ar_index][1] = col;
  384|  6.92k|    pred_pos_chroma[pos_ar_index][2] = 0;
  385|       |
  386|  6.92k|    ++pos_ar_index;
  387|  6.92k|  }
  388|       |
  389|  14.6k|  if (params->num_y_points > 0) {
  ------------------
  |  Branch (389:7): [True: 12.1k, False: 2.46k]
  ------------------
  390|  12.1k|    pred_pos_chroma[pos_ar_index][0] = 0;
  391|  12.1k|    pred_pos_chroma[pos_ar_index][1] = 0;
  392|  12.1k|    pred_pos_chroma[pos_ar_index][2] = 1;
  393|  12.1k|  }
  394|       |
  395|  14.6k|  *pred_pos_luma_p = pred_pos_luma;
  396|  14.6k|  *pred_pos_chroma_p = pred_pos_chroma;
  397|       |
  398|  14.6k|  *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2);
  399|  14.6k|  *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride *
  400|  14.6k|                                   (2 >> chroma_subsamp_y));
  401|  14.6k|  *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride *
  402|  14.6k|                                   (2 >> chroma_subsamp_y));
  403|       |
  404|  14.6k|  *y_col_buf =
  405|  14.6k|      (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2);
  406|  14.6k|  *cb_col_buf =
  407|  14.6k|      (int *)aom_malloc(sizeof(**cb_col_buf) *
  408|  14.6k|                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
  409|  14.6k|                        (2 >> chroma_subsamp_x));
  410|  14.6k|  *cr_col_buf =
  411|  14.6k|      (int *)aom_malloc(sizeof(**cr_col_buf) *
  412|  14.6k|                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
  413|  14.6k|                        (2 >> chroma_subsamp_x));
  414|       |
  415|  14.6k|  *luma_grain_block =
  416|  14.6k|      (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples);
  417|  14.6k|  *cb_grain_block =
  418|  14.6k|      (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples);
  419|  14.6k|  *cr_grain_block =
  420|  14.6k|      (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
  421|  14.6k|  if (!(*pred_pos_luma_p && *pred_pos_chroma_p && *y_line_buf && *cb_line_buf &&
  ------------------
  |  Branch (421:9): [True: 14.6k, False: 0]
  |  Branch (421:29): [True: 14.6k, False: 0]
  |  Branch (421:51): [True: 14.6k, False: 0]
  |  Branch (421:66): [True: 14.6k, False: 0]
  ------------------
  422|  14.6k|        *cr_line_buf && *y_col_buf && *cb_col_buf && *cr_col_buf &&
  ------------------
  |  Branch (422:9): [True: 14.6k, False: 0]
  |  Branch (422:25): [True: 14.6k, False: 0]
  |  Branch (422:39): [True: 14.6k, False: 0]
  |  Branch (422:54): [True: 14.6k, False: 0]
  ------------------
  423|  14.6k|        *luma_grain_block && *cb_grain_block && *cr_grain_block)) {
  ------------------
  |  Branch (423:9): [True: 14.6k, False: 0]
  |  Branch (423:30): [True: 14.6k, False: 0]
  |  Branch (423:49): [True: 14.6k, False: 0]
  ------------------
  424|      0|    dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block,
  425|      0|                   cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf,
  426|      0|                   cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf);
  427|      0|    return false;
  428|      0|  }
  429|  14.6k|  return true;
  430|  14.6k|}
grain_synthesis.c:generate_luma_grain_block:
  460|  14.6k|    int left_pad, int top_pad, int right_pad, int bottom_pad) {
  461|  14.6k|  if (params->num_y_points == 0) {
  ------------------
  |  Branch (461:7): [True: 2.46k, False: 12.1k]
  ------------------
  462|  2.46k|    memset(luma_grain_block, 0,
  463|  2.46k|           sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride);
  464|  2.46k|    return;
  465|  2.46k|  }
  466|       |
  467|  12.1k|  int bit_depth = params->bit_depth;
  468|  12.1k|  int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
  469|       |
  470|  12.1k|  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
  471|  12.1k|  int rounding_offset = (1 << (params->ar_coeff_shift - 1));
  472|       |
  473|   900k|  for (int i = 0; i < luma_block_size_y; i++)
  ------------------
  |  Branch (473:19): [True: 888k, False: 12.1k]
  ------------------
  474|  73.7M|    for (int j = 0; j < luma_block_size_x; j++)
  ------------------
  |  Branch (474:21): [True: 72.8M, False: 888k]
  ------------------
  475|  72.8M|      luma_grain_block[i * luma_grain_stride + j] =
  476|  72.8M|          (gaussian_sequence[get_random_number(gauss_bits)] +
  477|  72.8M|           ((1 << gauss_sec_shift) >> 1)) >>
  478|  72.8M|          gauss_sec_shift;
  479|       |
  480|   864k|  for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++)
  ------------------
  |  Branch (480:25): [True: 851k, False: 12.1k]
  ------------------
  481|  65.5M|    for (int j = left_pad; j < luma_block_size_x - right_pad; j++) {
  ------------------
  |  Branch (481:28): [True: 64.7M, False: 851k]
  ------------------
  482|  64.7M|      int wsum = 0;
  483|   199M|      for (int pos = 0; pos < num_pos_luma; pos++) {
  ------------------
  |  Branch (483:25): [True: 134M, False: 64.7M]
  ------------------
  484|   134M|        wsum = wsum + params->ar_coeffs_y[pos] *
  485|   134M|                          luma_grain_block[(i + pred_pos_luma[pos][0]) *
  486|   134M|                                               luma_grain_stride +
  487|   134M|                                           j + pred_pos_luma[pos][1]];
  488|   134M|      }
  489|  64.7M|      luma_grain_block[i * luma_grain_stride + j] =
  490|  64.7M|          clamp(luma_grain_block[i * luma_grain_stride + j] +
  491|  64.7M|                    ((wsum + rounding_offset) >> params->ar_coeff_shift),
  492|  64.7M|                grain_min, grain_max);
  493|  64.7M|    }
  494|  12.1k|}
grain_synthesis.c:generate_chroma_grain_blocks:
  501|  14.6k|    int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) {
  502|  14.6k|  int bit_depth = params->bit_depth;
  503|  14.6k|  int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
  504|       |
  505|  14.6k|  int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
  506|  14.6k|  if (params->num_y_points > 0) ++num_pos_chroma;
  ------------------
  |  Branch (506:7): [True: 12.1k, False: 2.46k]
  ------------------
  507|  14.6k|  int rounding_offset = (1 << (params->ar_coeff_shift - 1));
  508|  14.6k|  int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride;
  509|       |
  510|  14.6k|  if (params->num_cb_points || params->chroma_scaling_from_luma) {
  ------------------
  |  Branch (510:7): [True: 1.64k, False: 12.9k]
  |  Branch (510:32): [True: 8.35k, False: 4.63k]
  ------------------
  511|  9.99k|    init_random_generator(7 << 5, params->random_seed);
  512|       |
  513|   501k|    for (int i = 0; i < chroma_block_size_y; i++)
  ------------------
  |  Branch (513:21): [True: 491k, False: 9.99k]
  ------------------
  514|  24.6M|      for (int j = 0; j < chroma_block_size_x; j++)
  ------------------
  |  Branch (514:23): [True: 24.1M, False: 491k]
  ------------------
  515|  24.1M|        cb_grain_block[i * chroma_grain_stride + j] =
  516|  24.1M|            (gaussian_sequence[get_random_number(gauss_bits)] +
  517|  24.1M|             ((1 << gauss_sec_shift) >> 1)) >>
  518|  24.1M|            gauss_sec_shift;
  519|  9.99k|  } else {
  520|  4.63k|    memset(cb_grain_block, 0,
  521|  4.63k|           sizeof(*cb_grain_block) * chroma_grain_block_size);
  522|  4.63k|  }
  523|       |
  524|  14.6k|  if (params->num_cr_points || params->chroma_scaling_from_luma) {
  ------------------
  |  Branch (524:7): [True: 4.16k, False: 10.4k]
  |  Branch (524:32): [True: 8.35k, False: 2.11k]
  ------------------
  525|  12.5k|    init_random_generator(11 << 5, params->random_seed);
  526|       |
  527|   687k|    for (int i = 0; i < chroma_block_size_y; i++)
  ------------------
  |  Branch (527:21): [True: 675k, False: 12.5k]
  ------------------
  528|  40.1M|      for (int j = 0; j < chroma_block_size_x; j++)
  ------------------
  |  Branch (528:23): [True: 39.4M, False: 675k]
  ------------------
  529|  39.4M|        cr_grain_block[i * chroma_grain_stride + j] =
  530|  39.4M|            (gaussian_sequence[get_random_number(gauss_bits)] +
  531|  39.4M|             ((1 << gauss_sec_shift) >> 1)) >>
  532|  39.4M|            gauss_sec_shift;
  533|  12.5k|  } else {
  534|  2.11k|    memset(cr_grain_block, 0,
  535|  2.11k|           sizeof(*cr_grain_block) * chroma_grain_block_size);
  536|  2.11k|  }
  537|       |
  538|   789k|  for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++)
  ------------------
  |  Branch (538:25): [True: 774k, False: 14.6k]
  ------------------
  539|  43.9M|    for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) {
  ------------------
  |  Branch (539:28): [True: 43.1M, False: 774k]
  ------------------
  540|  43.1M|      int wsum_cb = 0;
  541|  43.1M|      int wsum_cr = 0;
  542|   127M|      for (int pos = 0; pos < num_pos_chroma; pos++) {
  ------------------
  |  Branch (542:25): [True: 83.9M, False: 43.1M]
  ------------------
  543|  83.9M|        if (pred_pos_chroma[pos][2] == 0) {
  ------------------
  |  Branch (543:13): [True: 46.9M, False: 36.9M]
  ------------------
  544|  46.9M|          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] *
  545|  46.9M|                                  cb_grain_block[(i + pred_pos_chroma[pos][0]) *
  546|  46.9M|                                                     chroma_grain_stride +
  547|  46.9M|                                                 j + pred_pos_chroma[pos][1]];
  548|  46.9M|          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] *
  549|  46.9M|                                  cr_grain_block[(i + pred_pos_chroma[pos][0]) *
  550|  46.9M|                                                     chroma_grain_stride +
  551|  46.9M|                                                 j + pred_pos_chroma[pos][1]];
  552|  46.9M|        } else if (pred_pos_chroma[pos][2] == 1) {
  ------------------
  |  Branch (552:20): [True: 36.9M, False: 0]
  ------------------
  553|  36.9M|          int av_luma = 0;
  554|  36.9M|          int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad;
  555|  36.9M|          int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad;
  556|       |
  557|  83.0M|          for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1;
  ------------------
  |  Branch (557:38): [True: 46.0M, False: 36.9M]
  ------------------
  558|  46.0M|               k++)
  559|   110M|            for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1;
  ------------------
  |  Branch (559:40): [True: 64.7M, False: 46.0M]
  ------------------
  560|  64.7M|                 l++)
  561|  64.7M|              av_luma += luma_grain_block[k * luma_grain_stride + l];
  562|       |
  563|  36.9M|          av_luma =
  564|  36.9M|              (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >>
  565|  36.9M|              (chroma_subsamp_y + chroma_subsamp_x);
  566|       |
  567|  36.9M|          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma;
  568|  36.9M|          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma;
  569|  36.9M|        } else {
  570|      0|          fprintf(
  571|      0|              stderr,
  572|      0|              "Grain synthesis: prediction between two chroma components is "
  573|      0|              "not supported!");
  574|      0|          return false;
  575|      0|        }
  576|  83.9M|      }
  577|  43.1M|      if (params->num_cb_points || params->chroma_scaling_from_luma)
  ------------------
  |  Branch (577:11): [True: 4.50M, False: 38.6M]
  |  Branch (577:36): [True: 15.4M, False: 23.2M]
  ------------------
  578|  19.9M|        cb_grain_block[i * chroma_grain_stride + j] =
  579|  19.9M|            clamp(cb_grain_block[i * chroma_grain_stride + j] +
  580|  19.9M|                      ((wsum_cb + rounding_offset) >> params->ar_coeff_shift),
  581|  19.9M|                  grain_min, grain_max);
  582|  43.1M|      if (params->num_cr_points || params->chroma_scaling_from_luma)
  ------------------
  |  Branch (582:11): [True: 18.1M, False: 25.0M]
  |  Branch (582:36): [True: 15.4M, False: 9.60M]
  ------------------
  583|  33.5M|        cr_grain_block[i * chroma_grain_stride + j] =
  584|  33.5M|            clamp(cr_grain_block[i * chroma_grain_stride + j] +
  585|  33.5M|                      ((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
  586|  33.5M|                  grain_min, grain_max);
  587|  43.1M|    }
  588|  14.6k|  return true;
  589|  14.6k|}
grain_synthesis.c:init_scaling_function:
  592|  27.2k|                                  int scaling_lut[]) {
  593|  27.2k|  if (num_points == 0) return;
  ------------------
  |  Branch (593:7): [True: 9.21k, False: 17.9k]
  ------------------
  594|       |
  595|   579k|  for (int i = 0; i < scaling_points[0][0]; i++)
  ------------------
  |  Branch (595:19): [True: 561k, False: 17.9k]
  ------------------
  596|   561k|    scaling_lut[i] = scaling_points[0][1];
  597|       |
  598|  40.6k|  for (int point = 0; point < num_points - 1; point++) {
  ------------------
  |  Branch (598:23): [True: 22.6k, False: 17.9k]
  ------------------
  599|  22.6k|    int delta_y = scaling_points[point + 1][1] - scaling_points[point][1];
  600|  22.6k|    int delta_x = scaling_points[point + 1][0] - scaling_points[point][0];
  601|       |
  602|  22.6k|    int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
  603|       |
  604|  2.59M|    for (int x = 0; x < delta_x; x++) {
  ------------------
  |  Branch (604:21): [True: 2.57M, False: 22.6k]
  ------------------
  605|  2.57M|      scaling_lut[scaling_points[point][0] + x] =
  606|  2.57M|          scaling_points[point][1] + (int)((x * delta + 32768) >> 16);
  607|  2.57M|    }
  608|  22.6k|  }
  609|       |
  610|  1.49M|  for (int i = scaling_points[num_points - 1][0]; i < 256; i++)
  ------------------
  |  Branch (610:51): [True: 1.47M, False: 17.9k]
  ------------------
  611|  1.47M|    scaling_lut[i] = scaling_points[num_points - 1][1];
  612|  17.9k|}
grain_synthesis.c:init_random_generator:
  442|  45.0k|static void init_random_generator(int luma_line, uint16_t seed) {
  443|       |  // same for the picture
  444|       |
  445|  45.0k|  uint16_t msb = (seed >> 8) & 255;
  446|  45.0k|  uint16_t lsb = seed & 255;
  447|       |
  448|  45.0k|  random_register = (msb << 8) + lsb;
  449|       |
  450|       |  //  changes for each row
  451|  45.0k|  int luma_num = luma_line >> 5;
  452|       |
  453|  45.0k|  random_register ^= ((luma_num * 37 + 178) & 255) << 8;
  454|  45.0k|  random_register ^= ((luma_num * 173 + 105) & 255);
  455|  45.0k|}
grain_synthesis.c:get_random_number:
  433|   136M|static inline int get_random_number(int bits) {
  434|   136M|  uint16_t bit;
  435|   136M|  bit = ((random_register >> 0) ^ (random_register >> 1) ^
  436|   136M|         (random_register >> 3) ^ (random_register >> 12)) &
  437|   136M|        1;
  438|   136M|  random_register = (random_register >> 1) | (bit << 15);
  439|   136M|  return (random_register >> (16 - bits)) & ((1 << bits) - 1);
  440|   136M|}
grain_synthesis.c:ver_boundary_overlap:
  915|  62.3k|                                 int height) {
  916|  62.3k|  if (width == 1) {
  ------------------
  |  Branch (916:7): [True: 16.4k, False: 45.8k]
  ------------------
  917|   178k|    while (height) {
  ------------------
  |  Branch (917:12): [True: 161k, False: 16.4k]
  ------------------
  918|   161k|      *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5,
  919|   161k|                         grain_min, grain_max);
  920|   161k|      left_block += left_stride;
  921|   161k|      right_block += right_stride;
  922|   161k|      dst_block += dst_stride;
  923|   161k|      --height;
  924|   161k|    }
  925|  16.4k|    return;
  926|  45.8k|  } else if (width == 2) {
  ------------------
  |  Branch (926:14): [True: 45.8k, False: 0]
  ------------------
  927|  1.12M|    while (height) {
  ------------------
  |  Branch (927:12): [True: 1.08M, False: 45.8k]
  ------------------
  928|  1.08M|      dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5,
  929|  1.08M|                           grain_min, grain_max);
  930|  1.08M|      dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5,
  931|  1.08M|                           grain_min, grain_max);
  932|  1.08M|      left_block += left_stride;
  933|  1.08M|      right_block += right_stride;
  934|  1.08M|      dst_block += dst_stride;
  935|  1.08M|      --height;
  936|  1.08M|    }
  937|  45.8k|    return;
  938|  45.8k|  }
  939|  62.3k|}
grain_synthesis.c:add_noise_to_block_hbd:
  748|  69.2k|    int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) {
  749|  69.2k|  int cb_mult = params->cb_mult - 128;            // fixed scale
  750|  69.2k|  int cb_luma_mult = params->cb_luma_mult - 128;  // fixed scale
  751|       |  // offset value depends on the bit depth
  752|  69.2k|  int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth);
  753|       |
  754|  69.2k|  int cr_mult = params->cr_mult - 128;            // fixed scale
  755|  69.2k|  int cr_luma_mult = params->cr_luma_mult - 128;  // fixed scale
  756|       |  // offset value depends on the bit depth
  757|  69.2k|  int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth);
  758|       |
  759|  69.2k|  int rounding_offset = (1 << (params->scaling_shift - 1));
  760|       |
  761|  69.2k|  int apply_y = params->num_y_points > 0 ? 1 : 0;
  ------------------
  |  Branch (761:17): [True: 66.7k, False: 2.52k]
  ------------------
  762|  69.2k|  int apply_cb =
  763|  69.2k|      (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
  ------------------
  |  Branch (763:7): [True: 33.3k, False: 35.9k]
  |  Branch (763:8): [True: 5.03k, False: 64.2k]
  |  Branch (763:37): [True: 28.2k, False: 35.9k]
  ------------------
  764|  69.2k|                                                                          : 0;
  765|  69.2k|  int apply_cr =
  766|  69.2k|      (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
  ------------------
  |  Branch (766:7): [True: 53.8k, False: 15.4k]
  |  Branch (766:8): [True: 25.5k, False: 43.7k]
  |  Branch (766:37): [True: 28.2k, False: 15.4k]
  ------------------
  767|  69.2k|                                                                          : 0;
  768|       |
  769|  69.2k|  if (params->chroma_scaling_from_luma) {
  ------------------
  |  Branch (769:7): [True: 28.2k, False: 40.9k]
  ------------------
  770|  28.2k|    cb_mult = 0;        // fixed scale
  771|  28.2k|    cb_luma_mult = 64;  // fixed scale
  772|  28.2k|    cb_offset = 0;
  773|       |
  774|  28.2k|    cr_mult = 0;        // fixed scale
  775|  28.2k|    cr_luma_mult = 64;  // fixed scale
  776|  28.2k|    cr_offset = 0;
  777|  28.2k|  }
  778|       |
  779|  69.2k|  int min_luma, max_luma, min_chroma, max_chroma;
  780|       |
  781|  69.2k|  if (params->clip_to_restricted_range) {
  ------------------
  |  Branch (781:7): [True: 11.5k, False: 57.7k]
  ------------------
  782|  11.5k|    min_luma = min_luma_legal_range << (bit_depth - 8);
  783|  11.5k|    max_luma = max_luma_legal_range << (bit_depth - 8);
  784|       |
  785|  11.5k|    if (mc_identity) {
  ------------------
  |  Branch (785:9): [True: 763, False: 10.7k]
  ------------------
  786|    763|      min_chroma = min_luma_legal_range << (bit_depth - 8);
  787|    763|      max_chroma = max_luma_legal_range << (bit_depth - 8);
  788|  10.7k|    } else {
  789|  10.7k|      min_chroma = min_chroma_legal_range << (bit_depth - 8);
  790|  10.7k|      max_chroma = max_chroma_legal_range << (bit_depth - 8);
  791|  10.7k|    }
  792|  57.7k|  } else {
  793|  57.7k|    min_luma = min_chroma = 0;
  794|  57.7k|    max_luma = max_chroma = (256 << (bit_depth - 8)) - 1;
  795|  57.7k|  }
  796|       |
  797|   960k|  for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
  ------------------
  |  Branch (797:19): [True: 890k, False: 69.2k]
  ------------------
  798|  13.4M|    for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
  ------------------
  |  Branch (798:21): [True: 12.5M, False: 890k]
  ------------------
  799|  12.5M|      int average_luma = 0;
  800|  12.5M|      if (chroma_subsamp_x) {
  ------------------
  |  Branch (800:11): [True: 1.58M, False: 11.0M]
  ------------------
  801|  1.58M|        average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
  802|  1.58M|                             (j << chroma_subsamp_x)] +
  803|  1.58M|                        luma[(i << chroma_subsamp_y) * luma_stride +
  804|  1.58M|                             (j << chroma_subsamp_x) + 1] +
  805|  1.58M|                        1) >>
  806|  1.58M|                       1;
  807|  11.0M|      } else {
  808|  11.0M|        average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
  809|  11.0M|      }
  810|       |
  811|  12.5M|      if (apply_cb) {
  ------------------
  |  Branch (811:11): [True: 6.10M, False: 6.48M]
  ------------------
  812|  6.10M|        cb[i * chroma_stride + j] = clamp(
  813|  6.10M|            cb[i * chroma_stride + j] +
  814|  6.10M|                ((scale_LUT(scaling_lut_cb,
  815|  6.10M|                            clamp(((average_luma * cb_luma_mult +
  816|  6.10M|                                    cb_mult * cb[i * chroma_stride + j]) >>
  817|  6.10M|                                   6) +
  818|  6.10M|                                      cb_offset,
  819|  6.10M|                                  0, (256 << (bit_depth - 8)) - 1),
  820|  6.10M|                            bit_depth) *
  821|  6.10M|                      cb_grain[i * chroma_grain_stride + j] +
  822|  6.10M|                  rounding_offset) >>
  823|  6.10M|                 params->scaling_shift),
  824|  6.10M|            min_chroma, max_chroma);
  825|  6.10M|      }
  826|  12.5M|      if (apply_cr) {
  ------------------
  |  Branch (826:11): [True: 10.2M, False: 2.37M]
  ------------------
  827|  10.2M|        cr[i * chroma_stride + j] = clamp(
  828|  10.2M|            cr[i * chroma_stride + j] +
  829|  10.2M|                ((scale_LUT(scaling_lut_cr,
  830|  10.2M|                            clamp(((average_luma * cr_luma_mult +
  831|  10.2M|                                    cr_mult * cr[i * chroma_stride + j]) >>
  832|  10.2M|                                   6) +
  833|  10.2M|                                      cr_offset,
  834|  10.2M|                                  0, (256 << (bit_depth - 8)) - 1),
  835|  10.2M|                            bit_depth) *
  836|  10.2M|                      cr_grain[i * chroma_grain_stride + j] +
  837|  10.2M|                  rounding_offset) >>
  838|  10.2M|                 params->scaling_shift),
  839|  10.2M|            min_chroma, max_chroma);
  840|  10.2M|      }
  841|  12.5M|    }
  842|   890k|  }
  843|       |
  844|  69.2k|  if (apply_y) {
  ------------------
  |  Branch (844:7): [True: 66.7k, False: 2.52k]
  ------------------
  845|  1.04M|    for (int i = 0; i < (half_luma_height << 1); i++) {
  ------------------
  |  Branch (845:21): [True: 980k, False: 66.7k]
  ------------------
  846|  16.3M|      for (int j = 0; j < (half_luma_width << 1); j++) {
  ------------------
  |  Branch (846:23): [True: 15.3M, False: 980k]
  ------------------
  847|  15.3M|        luma[i * luma_stride + j] =
  848|  15.3M|            clamp(luma[i * luma_stride + j] +
  849|  15.3M|                      ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j],
  850|  15.3M|                                  bit_depth) *
  851|  15.3M|                            luma_grain[i * luma_grain_stride + j] +
  852|  15.3M|                        rounding_offset) >>
  853|  15.3M|                       params->scaling_shift),
  854|  15.3M|                  min_luma, max_luma);
  855|  15.3M|      }
  856|   980k|    }
  857|  66.7k|  }
  858|  69.2k|}
grain_synthesis.c:scale_LUT:
  616|  36.8M|static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
  617|  36.8M|  int x = index >> (bit_depth - 8);
  618|       |
  619|  36.8M|  if (!(bit_depth - 8) || x == 255)
  ------------------
  |  Branch (619:7): [True: 5.11M, False: 31.7M]
  |  Branch (619:27): [True: 240k, False: 31.4M]
  ------------------
  620|  5.35M|    return scaling_lut[x];
  621|  31.4M|  else
  622|  31.4M|    return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) *
  623|  31.4M|                                  (index & ((1 << (bit_depth - 8)) - 1)) +
  624|  31.4M|                              (1 << (bit_depth - 9))) >>
  625|  31.4M|                             (bit_depth - 8));
  626|  36.8M|}
grain_synthesis.c:add_noise_to_block:
  635|  23.7k|                               int chroma_subsamp_x, int mc_identity) {
  636|  23.7k|  int cb_mult = params->cb_mult - 128;            // fixed scale
  637|  23.7k|  int cb_luma_mult = params->cb_luma_mult - 128;  // fixed scale
  638|  23.7k|  int cb_offset = params->cb_offset - 256;
  639|       |
  640|  23.7k|  int cr_mult = params->cr_mult - 128;            // fixed scale
  641|  23.7k|  int cr_luma_mult = params->cr_luma_mult - 128;  // fixed scale
  642|  23.7k|  int cr_offset = params->cr_offset - 256;
  643|       |
  644|  23.7k|  int rounding_offset = (1 << (params->scaling_shift - 1));
  645|       |
  646|  23.7k|  int apply_y = params->num_y_points > 0 ? 1 : 0;
  ------------------
  |  Branch (646:17): [True: 16.8k, False: 6.91k]
  ------------------
  647|  23.7k|  int apply_cb =
  648|  23.7k|      (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
  ------------------
  |  Branch (648:8): [True: 5.97k, False: 17.7k]
  |  Branch (648:37): [True: 6.40k, False: 11.3k]
  ------------------
  649|  23.7k|  int apply_cr =
  650|  23.7k|      (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
  ------------------
  |  Branch (650:8): [True: 6.01k, False: 17.7k]
  |  Branch (650:37): [True: 6.40k, False: 11.3k]
  ------------------
  651|       |
  652|  23.7k|  if (params->chroma_scaling_from_luma) {
  ------------------
  |  Branch (652:7): [True: 6.40k, False: 17.3k]
  ------------------
  653|  6.40k|    cb_mult = 0;        // fixed scale
  654|  6.40k|    cb_luma_mult = 64;  // fixed scale
  655|  6.40k|    cb_offset = 0;
  656|       |
  657|  6.40k|    cr_mult = 0;        // fixed scale
  658|  6.40k|    cr_luma_mult = 64;  // fixed scale
  659|  6.40k|    cr_offset = 0;
  660|  6.40k|  }
  661|       |
  662|  23.7k|  int min_luma, max_luma, min_chroma, max_chroma;
  663|       |
  664|  23.7k|  if (params->clip_to_restricted_range) {
  ------------------
  |  Branch (664:7): [True: 11.9k, False: 11.8k]
  ------------------
  665|  11.9k|    min_luma = min_luma_legal_range;
  666|  11.9k|    max_luma = max_luma_legal_range;
  667|       |
  668|  11.9k|    if (mc_identity) {
  ------------------
  |  Branch (668:9): [True: 718, False: 11.2k]
  ------------------
  669|    718|      min_chroma = min_luma_legal_range;
  670|    718|      max_chroma = max_luma_legal_range;
  671|  11.2k|    } else {
  672|  11.2k|      min_chroma = min_chroma_legal_range;
  673|  11.2k|      max_chroma = max_chroma_legal_range;
  674|  11.2k|    }
  675|  11.9k|  } else {
  676|  11.8k|    min_luma = min_chroma = 0;
  677|  11.8k|    max_luma = max_chroma = 255;
  678|  11.8k|  }
  679|       |
  680|   242k|  for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
  ------------------
  |  Branch (680:19): [True: 218k, False: 23.7k]
  ------------------
  681|  2.85M|    for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
  ------------------
  |  Branch (681:21): [True: 2.63M, False: 218k]
  ------------------
  682|  2.63M|      int average_luma = 0;
  683|  2.63M|      if (chroma_subsamp_x) {
  ------------------
  |  Branch (683:11): [True: 1.61M, False: 1.02M]
  ------------------
  684|  1.61M|        average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
  685|  1.61M|                             (j << chroma_subsamp_x)] +
  686|  1.61M|                        luma[(i << chroma_subsamp_y) * luma_stride +
  687|  1.61M|                             (j << chroma_subsamp_x) + 1] +
  688|  1.61M|                        1) >>
  689|  1.61M|                       1;
  690|  1.61M|      } else {
  691|  1.02M|        average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
  692|  1.02M|      }
  693|       |
  694|  2.63M|      if (apply_cb) {
  ------------------
  |  Branch (694:11): [True: 603k, False: 2.03M]
  ------------------
  695|   603k|        cb[i * chroma_stride + j] = clamp(
  696|   603k|            cb[i * chroma_stride + j] +
  697|   603k|                ((scale_LUT(scaling_lut_cb,
  698|   603k|                            clamp(((average_luma * cb_luma_mult +
  699|   603k|                                    cb_mult * cb[i * chroma_stride + j]) >>
  700|   603k|                                   6) +
  701|   603k|                                      cb_offset,
  702|   603k|                                  0, (256 << (bit_depth - 8)) - 1),
  703|   603k|                            8) *
  704|   603k|                      cb_grain[i * chroma_grain_stride + j] +
  705|   603k|                  rounding_offset) >>
  706|   603k|                 params->scaling_shift),
  707|   603k|            min_chroma, max_chroma);
  708|   603k|      }
  709|       |
  710|  2.63M|      if (apply_cr) {
  ------------------
  |  Branch (710:11): [True: 609k, False: 2.03M]
  ------------------
  711|   609k|        cr[i * chroma_stride + j] = clamp(
  712|   609k|            cr[i * chroma_stride + j] +
  713|   609k|                ((scale_LUT(scaling_lut_cr,
  714|   609k|                            clamp(((average_luma * cr_luma_mult +
  715|   609k|                                    cr_mult * cr[i * chroma_stride + j]) >>
  716|   609k|                                   6) +
  717|   609k|                                      cr_offset,
  718|   609k|                                  0, (256 << (bit_depth - 8)) - 1),
  719|   609k|                            8) *
  720|   609k|                      cr_grain[i * chroma_grain_stride + j] +
  721|   609k|                  rounding_offset) >>
  722|   609k|                 params->scaling_shift),
  723|   609k|            min_chroma, max_chroma);
  724|   609k|      }
  725|  2.63M|    }
  726|   218k|  }
  727|       |
  728|  23.7k|  if (apply_y) {
  ------------------
  |  Branch (728:7): [True: 16.8k, False: 6.91k]
  ------------------
  729|   250k|    for (int i = 0; i < (half_luma_height << 1); i++) {
  ------------------
  |  Branch (729:21): [True: 233k, False: 16.8k]
  ------------------
  730|  4.13M|      for (int j = 0; j < (half_luma_width << 1); j++) {
  ------------------
  |  Branch (730:23): [True: 3.89M, False: 233k]
  ------------------
  731|  3.89M|        luma[i * luma_stride + j] =
  732|  3.89M|            clamp(luma[i * luma_stride + j] +
  733|  3.89M|                      ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) *
  734|  3.89M|                            luma_grain[i * luma_grain_stride + j] +
  735|  3.89M|                        rounding_offset) >>
  736|  3.89M|                       params->scaling_shift),
  737|  3.89M|                  min_luma, max_luma);
  738|  3.89M|      }
  739|   233k|    }
  740|  16.8k|  }
  741|  23.7k|}
grain_synthesis.c:hor_boundary_overlap:
  944|  93.6k|                                 int height) {
  945|  93.6k|  if (height == 1) {
  ------------------
  |  Branch (945:7): [True: 17.2k, False: 76.4k]
  ------------------
  946|   161k|    while (width) {
  ------------------
  |  Branch (946:12): [True: 144k, False: 17.2k]
  ------------------
  947|   144k|      *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5,
  948|   144k|                         grain_min, grain_max);
  949|   144k|      ++top_block;
  950|   144k|      ++bottom_block;
  951|   144k|      ++dst_block;
  952|   144k|      --width;
  953|   144k|    }
  954|  17.2k|    return;
  955|  76.4k|  } else if (height == 2) {
  ------------------
  |  Branch (955:14): [True: 76.4k, False: 0]
  ------------------
  956|  1.22M|    while (width) {
  ------------------
  |  Branch (956:12): [True: 1.14M, False: 76.4k]
  ------------------
  957|  1.14M|      dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5,
  958|  1.14M|                           grain_min, grain_max);
  959|  1.14M|      dst_block[dst_stride] = clamp((17 * top_block[top_stride] +
  960|  1.14M|                                     27 * bottom_block[bottom_stride] + 16) >>
  961|  1.14M|                                        5,
  962|  1.14M|                                    grain_min, grain_max);
  963|  1.14M|      ++top_block;
  964|  1.14M|      ++bottom_block;
  965|  1.14M|      ++dst_block;
  966|  1.14M|      --width;
  967|  1.14M|    }
  968|  76.4k|    return;
  969|  76.4k|  }
  970|  93.6k|}
grain_synthesis.c:copy_area:
  874|   283k|                      int width, int height) {
  875|  2.66M|  while (height) {
  ------------------
  |  Branch (875:10): [True: 2.37M, False: 283k]
  ------------------
  876|  2.37M|    memcpy(dst, src, width * sizeof(*src));
  877|  2.37M|    src += src_stride;
  878|  2.37M|    dst += dst_stride;
  879|  2.37M|    --height;
  880|  2.37M|  }
  881|   283k|  return;
  882|   283k|}
grain_synthesis.c:dealloc_arrays:
  247|  14.6k|                           int **cr_col_buf) {
  248|  14.6k|  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
  249|  14.6k|  int num_pos_chroma = num_pos_luma;
  250|  14.6k|  if (params->num_y_points > 0) ++num_pos_chroma;
  ------------------
  |  Branch (250:7): [True: 12.1k, False: 2.46k]
  ------------------
  251|       |
  252|  14.6k|  if (*pred_pos_luma) {
  ------------------
  |  Branch (252:7): [True: 14.6k, False: 0]
  ------------------
  253|  44.1k|    for (int row = 0; row < num_pos_luma; row++) {
  ------------------
  |  Branch (253:23): [True: 29.4k, False: 14.6k]
  ------------------
  254|  29.4k|      aom_free((*pred_pos_luma)[row]);
  255|  29.4k|    }
  256|  14.6k|    aom_free(*pred_pos_luma);
  257|  14.6k|    *pred_pos_luma = NULL;
  258|  14.6k|  }
  259|       |
  260|  14.6k|  if (*pred_pos_chroma) {
  ------------------
  |  Branch (260:7): [True: 14.6k, False: 0]
  ------------------
  261|  56.2k|    for (int row = 0; row < num_pos_chroma; row++) {
  ------------------
  |  Branch (261:23): [True: 41.6k, False: 14.6k]
  ------------------
  262|  41.6k|      aom_free((*pred_pos_chroma)[row]);
  263|  41.6k|    }
  264|  14.6k|    aom_free(*pred_pos_chroma);
  265|  14.6k|    *pred_pos_chroma = NULL;
  266|  14.6k|  }
  267|       |
  268|  14.6k|  aom_free(*y_line_buf);
  269|  14.6k|  *y_line_buf = NULL;
  270|       |
  271|  14.6k|  aom_free(*cb_line_buf);
  272|  14.6k|  *cb_line_buf = NULL;
  273|       |
  274|  14.6k|  aom_free(*cr_line_buf);
  275|  14.6k|  *cr_line_buf = NULL;
  276|       |
  277|  14.6k|  aom_free(*y_col_buf);
  278|  14.6k|  *y_col_buf = NULL;
  279|       |
  280|  14.6k|  aom_free(*cb_col_buf);
  281|  14.6k|  *cb_col_buf = NULL;
  282|       |
  283|  14.6k|  aom_free(*cr_col_buf);
  284|  14.6k|  *cr_col_buf = NULL;
  285|       |
  286|  14.6k|  aom_free(*luma_grain_block);
  287|  14.6k|  *luma_grain_block = NULL;
  288|       |
  289|  14.6k|  aom_free(*cb_grain_block);
  290|  14.6k|  *cb_grain_block = NULL;
  291|       |
  292|  14.6k|  aom_free(*cr_grain_block);
  293|  14.6k|  *cr_grain_block = NULL;
  294|  14.6k|}

aom_get_num_layers_from_operating_point_idc:
   31|   200k|    unsigned int *number_temporal_layers) {
   32|       |  // derive number of spatial/temporal layers from operating_point_idc
   33|       |
   34|   200k|  if (!number_spatial_layers || !number_temporal_layers)
  ------------------
  |  Branch (34:7): [True: 0, False: 200k]
  |  Branch (34:33): [True: 0, False: 200k]
  ------------------
   35|      0|    return AOM_CODEC_INVALID_PARAM;
   36|       |
   37|   200k|  if (operating_point_idc == 0) {
  ------------------
  |  Branch (37:7): [True: 144k, False: 56.8k]
  ------------------
   38|   144k|    *number_temporal_layers = 1;
   39|   144k|    *number_spatial_layers = 1;
   40|   144k|  } else {
   41|  56.8k|    *number_spatial_layers = 0;
   42|  56.8k|    *number_temporal_layers = 0;
   43|   284k|    for (int j = 0; j < MAX_NUM_SPATIAL_LAYERS; j++) {
  ------------------
  |  |   71|   284k|#define MAX_NUM_SPATIAL_LAYERS 4
  ------------------
  |  Branch (43:21): [True: 227k, False: 56.8k]
  ------------------
   44|   227k|      *number_spatial_layers +=
   45|   227k|          (operating_point_idc >> (j + MAX_NUM_TEMPORAL_LAYERS)) & 0x1;
  ------------------
  |  |   70|   227k|#define MAX_NUM_TEMPORAL_LAYERS 8
  ------------------
   46|   227k|    }
   47|   511k|    for (int j = 0; j < MAX_NUM_TEMPORAL_LAYERS; j++) {
  ------------------
  |  |   70|   511k|#define MAX_NUM_TEMPORAL_LAYERS 8
  ------------------
  |  Branch (47:21): [True: 454k, False: 56.8k]
  ------------------
   48|   454k|      *number_temporal_layers += (operating_point_idc >> j) & 0x1;
   49|   454k|    }
   50|  56.8k|  }
   51|       |
   52|   200k|  return AOM_CODEC_OK;
   53|   200k|}
aom_decode_frame_from_obus:
  871|   311k|                               const uint8_t **p_data_end) {
  872|   311k|  AV1_COMMON *const cm = &pbi->common;
  873|   311k|  int frame_decoding_finished = 0;
  874|   311k|  int is_first_tg_obu_received = 1;
  875|       |  // Whenever pbi->seen_frame_header is set to 1, frame_header is set to the
  876|       |  // beginning of the frame_header_obu and frame_header_size is set to its
  877|       |  // size. This allows us to check if a redundant frame_header_obu is a copy
  878|       |  // of the previous frame_header_obu.
  879|       |  //
  880|       |  // Initialize frame_header to a dummy nonnull pointer, otherwise the Clang
  881|       |  // Static Analyzer in clang 7.0.1 will falsely warn that a null pointer is
  882|       |  // passed as an argument to a 'nonnull' parameter of memcmp(). The initial
  883|       |  // value will not be used.
  884|   311k|  const uint8_t *frame_header = data;
  885|   311k|  uint32_t frame_header_size = 0;
  886|   311k|  ObuHeader obu_header;
  887|   311k|  memset(&obu_header, 0, sizeof(obu_header));
  888|   311k|  pbi->seen_frame_header = 0;
  889|   311k|  pbi->next_start_tile = 0;
  890|   311k|  pbi->num_tile_groups = 0;
  891|       |
  892|   311k|  if (data_end < data) {
  ------------------
  |  Branch (892:7): [True: 0, False: 311k]
  ------------------
  893|      0|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  894|      0|    return -1;
  895|      0|  }
  896|       |
  897|       |  // Reset pbi->camera_frame_header_ready to 0 if cm->tiles.large_scale = 0.
  898|   311k|  if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0;
  ------------------
  |  Branch (898:7): [True: 241k, False: 69.9k]
  ------------------
  899|       |
  900|       |  // decode frame as a series of OBUs
  901|   642k|  while (!frame_decoding_finished && pbi->error.error_code == AOM_CODEC_OK) {
  ------------------
  |  Branch (901:10): [True: 537k, False: 105k]
  |  Branch (901:38): [True: 537k, False: 107]
  ------------------
  902|   537k|    struct aom_read_bit_buffer rb;
  903|   537k|    size_t payload_size = 0;
  904|   537k|    size_t decoded_payload_size = 0;
  905|   537k|    size_t obu_payload_offset = 0;
  906|   537k|    size_t bytes_read = 0;
  907|   537k|    const size_t bytes_available = data_end - data;
  908|       |
  909|   537k|    if (bytes_available == 0 && !pbi->seen_frame_header) {
  ------------------
  |  Branch (909:9): [True: 929, False: 536k]
  |  Branch (909:33): [True: 697, False: 232]
  ------------------
  910|    697|      *p_data_end = data;
  911|    697|      pbi->error.error_code = AOM_CODEC_OK;
  912|    697|      break;
  913|    697|    }
  914|       |
  915|   536k|    aom_codec_err_t status =
  916|   536k|        aom_read_obu_header_and_size(data, bytes_available, pbi->is_annexb,
  917|   536k|                                     &obu_header, &payload_size, &bytes_read);
  918|       |
  919|   536k|    if (status != AOM_CODEC_OK) {
  ------------------
  |  Branch (919:9): [True: 28.9k, False: 507k]
  ------------------
  920|  28.9k|      pbi->error.error_code = status;
  921|  28.9k|      return -1;
  922|  28.9k|    }
  923|       |
  924|       |    // Record obu size header information.
  925|   507k|    pbi->obu_size_hdr.data = data + obu_header.size;
  926|   507k|    pbi->obu_size_hdr.size = bytes_read - obu_header.size;
  927|       |
  928|       |    // Note: aom_read_obu_header_and_size() takes care of checking that this
  929|       |    // doesn't cause 'data' to advance past 'data_end'.
  930|   507k|    data += bytes_read;
  931|       |
  932|   507k|    if ((size_t)(data_end - data) < payload_size) {
  ------------------
  |  Branch (932:9): [True: 8.17k, False: 499k]
  ------------------
  933|  8.17k|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  934|  8.17k|      return -1;
  935|  8.17k|    }
  936|       |
  937|   499k|    cm->temporal_layer_id = obu_header.temporal_layer_id;
  938|   499k|    cm->spatial_layer_id = obu_header.spatial_layer_id;
  939|       |
  940|   499k|    if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
  ------------------
  |  Branch (940:9): [True: 358k, False: 140k]
  ------------------
  941|   499k|        obu_header.type != OBU_SEQUENCE_HEADER) {
  ------------------
  |  Branch (941:9): [True: 271k, False: 87.4k]
  ------------------
  942|       |      // don't decode obu if it's not in current operating mode
  943|   271k|      if (!is_obu_in_current_operating_point(pbi, &obu_header)) {
  ------------------
  |  Branch (943:11): [True: 1.09k, False: 270k]
  ------------------
  944|  1.09k|        data += payload_size;
  945|  1.09k|        continue;
  946|  1.09k|      }
  947|   271k|    }
  948|       |
  949|   498k|    av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size);
  950|       |
  951|   498k|    switch (obu_header.type) {
  952|   140k|      case OBU_TEMPORAL_DELIMITER:
  ------------------
  |  Branch (952:7): [True: 140k, False: 357k]
  ------------------
  953|   140k|        decoded_payload_size = read_temporal_delimiter_obu();
  954|   140k|        if (pbi->seen_frame_header) {
  ------------------
  |  Branch (954:13): [True: 16, False: 140k]
  ------------------
  955|       |          // A new temporal unit has started, but the frame in the previous
  956|       |          // temporal unit is incomplete.
  957|     16|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  958|     16|          return -1;
  959|     16|        }
  960|   140k|        break;
  961|   140k|      case OBU_SEQUENCE_HEADER:
  ------------------
  |  Branch (961:7): [True: 87.4k, False: 411k]
  ------------------
  962|  87.4k|        decoded_payload_size = read_sequence_header_obu(pbi, &rb);
  963|  87.4k|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (963:13): [True: 8.05k, False: 79.3k]
  ------------------
  964|       |        // The sequence header should not change in the middle of a frame.
  965|  79.3k|        if (pbi->sequence_header_changed && pbi->seen_frame_header) {
  ------------------
  |  Branch (965:13): [True: 29.7k, False: 49.6k]
  |  Branch (965:45): [True: 2, False: 29.7k]
  ------------------
  966|      2|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  967|      2|          return -1;
  968|      2|        }
  969|  79.3k|        break;
  970|  79.3k|      case OBU_FRAME_HEADER:
  ------------------
  |  Branch (970:7): [True: 1.05k, False: 497k]
  ------------------
  971|  1.75k|      case OBU_REDUNDANT_FRAME_HEADER:
  ------------------
  |  Branch (971:7): [True: 703, False: 497k]
  ------------------
  972|   252k|      case OBU_FRAME:
  ------------------
  |  Branch (972:7): [True: 250k, False: 247k]
  ------------------
  973|   252k|        if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) {
  ------------------
  |  Branch (973:13): [True: 703, False: 252k]
  ------------------
  974|    703|          if (!pbi->seen_frame_header) {
  ------------------
  |  Branch (974:15): [True: 55, False: 648]
  ------------------
  975|     55|            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  976|     55|            return -1;
  977|     55|          }
  978|   252k|        } else {
  979|       |          // OBU_FRAME_HEADER or OBU_FRAME.
  980|   252k|          if (pbi->seen_frame_header) {
  ------------------
  |  Branch (980:15): [True: 21, False: 251k]
  ------------------
  981|     21|            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  982|     21|            return -1;
  983|     21|          }
  984|   252k|        }
  985|       |        // Only decode first frame header received
  986|   252k|        if (!pbi->seen_frame_header ||
  ------------------
  |  Branch (986:13): [True: 251k, False: 648]
  ------------------
  987|   252k|            (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) {
  ------------------
  |  Branch (987:14): [True: 0, False: 648]
  |  Branch (987:39): [True: 0, False: 0]
  ------------------
  988|   251k|          frame_header_size = read_frame_header_obu(
  989|   251k|              pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
  990|   251k|          frame_header = data;
  991|   251k|          pbi->seen_frame_header = 1;
  992|   251k|          if (!pbi->ext_tile_debug && cm->tiles.large_scale)
  ------------------
  |  Branch (992:15): [True: 130k, False: 121k]
  |  Branch (992:39): [True: 14.0k, False: 116k]
  ------------------
  993|  14.0k|            pbi->camera_frame_header_ready = 1;
  994|   251k|        } else {
  995|       |          // Verify that the frame_header_obu is identical to the original
  996|       |          // frame_header_obu.
  997|    648|          if (frame_header_size > payload_size ||
  ------------------
  |  Branch (997:15): [True: 18, False: 630]
  ------------------
  998|    648|              memcmp(data, frame_header, frame_header_size) != 0) {
  ------------------
  |  Branch (998:15): [True: 347, False: 283]
  ------------------
  999|    365|            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1000|    365|            return -1;
 1001|    365|          }
 1002|    283|          assert(rb.bit_offset == 0);
 1003|    283|          rb.bit_offset = 8 * frame_header_size;
 1004|    283|        }
 1005|       |
 1006|   252k|        decoded_payload_size = frame_header_size;
 1007|   252k|        pbi->frame_header_size = frame_header_size;
 1008|   252k|        cm->cur_frame->temporal_id = obu_header.temporal_layer_id;
 1009|   252k|        cm->cur_frame->spatial_id = obu_header.spatial_layer_id;
 1010|       |
 1011|   252k|        if (cm->show_existing_frame) {
  ------------------
  |  Branch (1011:13): [True: 807, False: 251k]
  ------------------
 1012|    807|          if (obu_header.type == OBU_FRAME) {
  ------------------
  |  Branch (1012:15): [True: 663, False: 144]
  ------------------
 1013|    663|            pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
 1014|    663|            return -1;
 1015|    663|          }
 1016|    144|          frame_decoding_finished = 1;
 1017|    144|          pbi->seen_frame_header = 0;
 1018|       |
 1019|    144|          if (cm->show_frame &&
  ------------------
  |  Branch (1019:15): [True: 144, False: 0]
  ------------------
 1020|    144|              !cm->seq_params->order_hint_info.enable_order_hint) {
  ------------------
  |  Branch (1020:15): [True: 0, False: 144]
  ------------------
 1021|      0|            ++cm->current_frame.frame_number;
 1022|      0|          }
 1023|    144|          break;
 1024|    807|        }
 1025|       |
 1026|       |        // In large scale tile coding, decode the common camera frame header
 1027|       |        // before any tile list OBU.
 1028|   251k|        if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) {
  ------------------
  |  Branch (1028:13): [True: 129k, False: 121k]
  |  Branch (1028:37): [True: 14.0k, False: 115k]
  ------------------
 1029|  14.0k|          frame_decoding_finished = 1;
 1030|       |          // Skip the rest of the frame data.
 1031|  14.0k|          decoded_payload_size = payload_size;
 1032|       |          // Update data_end.
 1033|  14.0k|          *p_data_end = data_end;
 1034|  14.0k|          break;
 1035|  14.0k|        }
 1036|       |
 1037|   237k|        if (obu_header.type != OBU_FRAME) break;
  ------------------
  |  Branch (1037:13): [True: 1.08k, False: 236k]
  ------------------
 1038|   236k|        obu_payload_offset = frame_header_size;
 1039|       |        // Byte align the reader before reading the tile group.
 1040|       |        // byte_alignment() has set pbi->error.error_code if it returns -1.
 1041|   236k|        if (byte_alignment(cm, &rb)) return -1;
  ------------------
  |  Branch (1041:13): [True: 4.65k, False: 231k]
  ------------------
 1042|   231k|        AOM_FALLTHROUGH_INTENDED;  // fall through to read tile group.
  ------------------
  |  |   52|   231k|  do {                           \
  |  |   53|   231k|  } while (0)
  |  |  ------------------
  |  |  |  Branch (53:12): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1043|   231k|      case OBU_TILE_GROUP:
  ------------------
  |  Branch (1043:7): [True: 119, False: 498k]
  ------------------
 1044|   231k|        if (!pbi->seen_frame_header) {
  ------------------
  |  Branch (1044:13): [True: 94, False: 231k]
  ------------------
 1045|     94|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1046|     94|          return -1;
 1047|     94|        }
 1048|   231k|        if (obu_payload_offset > payload_size) {
  ------------------
  |  Branch (1048:13): [True: 0, False: 231k]
  ------------------
 1049|      0|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1050|      0|          return -1;
 1051|      0|        }
 1052|   231k|        decoded_payload_size += read_one_tile_group_obu(
 1053|   231k|            pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset,
 1054|   231k|            data + payload_size, p_data_end, &frame_decoding_finished,
 1055|   231k|            obu_header.type == OBU_FRAME);
 1056|   231k|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1056:13): [True: 2.16k, False: 229k]
  ------------------
 1057|   229k|        is_first_tg_obu_received = 0;
 1058|   229k|        if (frame_decoding_finished) {
  ------------------
  |  Branch (1058:13): [True: 91.1k, False: 138k]
  ------------------
 1059|  91.1k|          pbi->seen_frame_header = 0;
 1060|  91.1k|          pbi->next_start_tile = 0;
 1061|  91.1k|        }
 1062|   229k|        pbi->num_tile_groups++;
 1063|   229k|        break;
 1064|  10.9k|      case OBU_METADATA: {
  ------------------
  |  Branch (1064:7): [True: 10.9k, False: 487k]
  ------------------
 1065|  10.9k|        decoded_payload_size =
 1066|  10.9k|            read_metadata(pbi, data, payload_size, obu_header.has_extension);
 1067|  10.9k|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1067:13): [True: 4.64k, False: 6.33k]
  ------------------
 1068|  6.33k|        break;
 1069|  10.9k|      }
 1070|  6.33k|      case OBU_TILE_LIST:
  ------------------
  |  Branch (1070:7): [True: 1.05k, False: 497k]
  ------------------
 1071|  1.05k|        if (CONFIG_NORMAL_TILE_MODE) {
  ------------------
  |  |   54|  1.05k|#define CONFIG_NORMAL_TILE_MODE 0
  |  |  ------------------
  |  |  |  Branch (54:33): [Folded - Ignored]
  |  |  ------------------
  ------------------
 1072|      0|          pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
 1073|      0|          return -1;
 1074|      0|        }
 1075|       |
 1076|       |        // This OBU type is purely for the large scale tile coding mode.
 1077|       |        // The common camera frame header has to be already decoded.
 1078|  1.05k|        if (!pbi->camera_frame_header_ready) {
  ------------------
  |  Branch (1078:13): [True: 257, False: 801]
  ------------------
 1079|    257|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1080|    257|          return -1;
 1081|    257|        }
 1082|       |
 1083|    801|        cm->tiles.large_scale = 1;
 1084|    801|        av1_set_single_tile_decoding_mode(cm);
 1085|    801|        decoded_payload_size =
 1086|    801|            read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size,
 1087|    801|                                          p_data_end, &frame_decoding_finished);
 1088|    801|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1088:13): [True: 271, False: 530]
  ------------------
 1089|    530|        break;
 1090|  3.39k|      case OBU_PADDING:
  ------------------
  |  Branch (1090:7): [True: 3.39k, False: 495k]
  ------------------
 1091|  3.39k|        decoded_payload_size = read_padding(cm, data, payload_size);
 1092|  3.39k|        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1092:13): [True: 44, False: 3.35k]
  ------------------
 1093|  3.35k|        break;
 1094|  3.35k|      default:
  ------------------
  |  Branch (1094:7): [True: 2.17k, False: 496k]
  ------------------
 1095|       |        // Skip unrecognized OBUs
 1096|  2.17k|        if (payload_size > 0 &&
  ------------------
  |  Branch (1096:13): [True: 1.62k, False: 559]
  ------------------
 1097|  2.17k|            get_last_nonzero_byte(data, payload_size) == 0) {
  ------------------
  |  Branch (1097:13): [True: 20, False: 1.60k]
  ------------------
 1098|     20|          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1099|     20|          return -1;
 1100|     20|        }
 1101|  2.15k|        decoded_payload_size = payload_size;
 1102|  2.15k|        break;
 1103|   498k|    }
 1104|       |
 1105|       |    // Check that the signalled OBU size matches the actual amount of data read
 1106|   332k|    if (decoded_payload_size > payload_size) {
  ------------------
  |  Branch (1106:9): [True: 0, False: 332k]
  ------------------
 1107|      0|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1108|      0|      return -1;
 1109|      0|    }
 1110|       |
 1111|       |    // If there are extra padding bytes, they should all be zero
 1112|   354k|    while (decoded_payload_size < payload_size) {
  ------------------
  |  Branch (1112:12): [True: 23.9k, False: 330k]
  ------------------
 1113|  23.9k|      uint8_t padding_byte = data[decoded_payload_size++];
 1114|  23.9k|      if (padding_byte != 0) {
  ------------------
  |  Branch (1114:11): [True: 2.10k, False: 21.8k]
  ------------------
 1115|  2.10k|        pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
 1116|  2.10k|        return -1;
 1117|  2.10k|      }
 1118|  23.9k|    }
 1119|       |
 1120|   330k|    data += payload_size;
 1121|   330k|  }
 1122|       |
 1123|   106k|  if (pbi->error.error_code != AOM_CODEC_OK) return -1;
  ------------------
  |  Branch (1123:7): [True: 108, False: 105k]
  ------------------
 1124|   105k|  return frame_decoding_finished;
 1125|   106k|}
obu.c:is_obu_in_current_operating_point:
   56|   271k|                                             const ObuHeader *obu_header) {
   57|   271k|  if (!pbi->current_operating_point || !obu_header->has_extension) {
  ------------------
  |  Branch (57:7): [True: 205k, False: 66.2k]
  |  Branch (57:40): [True: 64.9k, False: 1.29k]
  ------------------
   58|   270k|    return 1;
   59|   270k|  }
   60|       |
   61|  1.29k|  if ((pbi->current_operating_point >> obu_header->temporal_layer_id) & 0x1 &&
  ------------------
  |  Branch (61:7): [True: 676, False: 618]
  ------------------
   62|  1.29k|      (pbi->current_operating_point >> (obu_header->spatial_layer_id + 8)) &
  ------------------
  |  Branch (62:7): [True: 203, False: 473]
  ------------------
   63|    676|          0x1) {
   64|    203|    return 1;
   65|    203|  }
   66|  1.09k|  return 0;
   67|  1.29k|}
obu.c:read_temporal_delimiter_obu:
   80|   140k|static uint32_t read_temporal_delimiter_obu(void) { return 0; }
obu.c:read_sequence_header_obu:
  105|  87.4k|                                         struct aom_read_bit_buffer *rb) {
  106|  87.4k|  AV1_COMMON *const cm = &pbi->common;
  107|  87.4k|  const uint32_t saved_bit_offset = rb->bit_offset;
  108|       |
  109|       |  // Verify rb has been configured to report errors.
  110|  87.4k|  assert(rb->error_handler);
  111|       |
  112|       |  // Use a local variable to store the information as we decode. At the end,
  113|       |  // if no errors have occurred, cm->seq_params is updated.
  114|  87.4k|  SequenceHeader sh = *cm->seq_params;
  115|  87.4k|  SequenceHeader *const seq_params = &sh;
  116|       |
  117|  87.4k|  seq_params->profile = av1_read_profile(rb);
  118|  87.4k|  if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
  ------------------
  |  |   50|  87.4k|#define CONFIG_MAX_DECODE_PROFILE 2
  ------------------
  |  Branch (118:7): [True: 258, False: 87.1k]
  ------------------
  119|    258|    pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  120|    258|    return 0;
  121|    258|  }
  122|       |
  123|       |  // Still picture or not
  124|  87.1k|  seq_params->still_picture = aom_rb_read_bit(rb);
  125|  87.1k|  seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
  126|       |  // Video must have reduced_still_picture_hdr = 0
  127|  87.1k|  if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (127:7): [True: 68.2k, False: 18.9k]
  |  Branch (127:37): [True: 30, False: 68.1k]
  ------------------
  128|     30|    pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  129|     30|    return 0;
  130|     30|  }
  131|       |
  132|  87.1k|  if (seq_params->reduced_still_picture_hdr) {
  ------------------
  |  Branch (132:7): [True: 18.1k, False: 68.9k]
  ------------------
  133|  18.1k|    seq_params->timing_info_present = 0;
  134|  18.1k|    seq_params->decoder_model_info_present_flag = 0;
  135|  18.1k|    seq_params->display_model_info_present_flag = 0;
  136|  18.1k|    seq_params->operating_points_cnt_minus_1 = 0;
  137|  18.1k|    seq_params->operating_point_idc[0] = 0;
  138|  18.1k|    seq_params->has_nonzero_operating_point_idc = false;
  139|  18.1k|    if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) {
  ------------------
  |  Branch (139:9): [True: 515, False: 17.6k]
  ------------------
  140|    515|      pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  141|    515|      return 0;
  142|    515|    }
  143|  17.6k|    seq_params->tier[0] = 0;
  144|  17.6k|    seq_params->op_params[0].decoder_model_param_present_flag = 0;
  145|  17.6k|    seq_params->op_params[0].display_model_param_present_flag = 0;
  146|  68.9k|  } else {
  147|  68.9k|    seq_params->timing_info_present = aom_rb_read_bit(rb);
  148|  68.9k|    if (seq_params->timing_info_present) {
  ------------------
  |  Branch (148:9): [True: 3.66k, False: 65.3k]
  ------------------
  149|  3.66k|      av1_read_timing_info_header(&seq_params->timing_info, &pbi->error, rb);
  150|       |
  151|  3.66k|      seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb);
  152|  3.66k|      if (seq_params->decoder_model_info_present_flag)
  ------------------
  |  Branch (152:11): [True: 1.09k, False: 2.56k]
  ------------------
  153|  1.09k|        av1_read_decoder_model_info(&seq_params->decoder_model_info, rb);
  154|  65.3k|    } else {
  155|  65.3k|      seq_params->decoder_model_info_present_flag = 0;
  156|  65.3k|    }
  157|  68.9k|    seq_params->display_model_info_present_flag = aom_rb_read_bit(rb);
  158|  68.9k|    seq_params->operating_points_cnt_minus_1 =
  159|  68.9k|        aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
  ------------------
  |  |   93|  68.9k|#define OP_POINTS_CNT_MINUS_1_BITS 5
  ------------------
  160|  68.9k|    seq_params->has_nonzero_operating_point_idc = false;
  161|   141k|    for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
  ------------------
  |  Branch (161:21): [True: 76.5k, False: 64.9k]
  ------------------
  162|  76.5k|      seq_params->operating_point_idc[i] =
  163|  76.5k|          aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
  ------------------
  |  |   94|  76.5k|#define OP_POINTS_IDC_BITS 12
  ------------------
  164|  76.5k|      if (seq_params->operating_point_idc[i] != 0)
  ------------------
  |  Branch (164:11): [True: 24.1k, False: 52.3k]
  ------------------
  165|  24.1k|        seq_params->has_nonzero_operating_point_idc = true;
  166|  76.5k|      if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) {
  ------------------
  |  Branch (166:11): [True: 4.04k, False: 72.4k]
  ------------------
  167|  4.04k|        pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
  168|  4.04k|        return 0;
  169|  4.04k|      }
  170|       |      // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
  171|       |      // is equivalent to level 3.3.
  172|  72.4k|      if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0)
  ------------------
  |  Branch (172:11): [True: 7.44k, False: 65.0k]
  ------------------
  173|  7.44k|        seq_params->tier[i] = aom_rb_read_bit(rb);
  174|  65.0k|      else
  175|  65.0k|        seq_params->tier[i] = 0;
  176|  72.4k|      if (seq_params->decoder_model_info_present_flag) {
  ------------------
  |  Branch (176:11): [True: 3.31k, False: 69.1k]
  ------------------
  177|  3.31k|        seq_params->op_params[i].decoder_model_param_present_flag =
  178|  3.31k|            aom_rb_read_bit(rb);
  179|  3.31k|        if (seq_params->op_params[i].decoder_model_param_present_flag)
  ------------------
  |  Branch (179:13): [True: 2.06k, False: 1.25k]
  ------------------
  180|  2.06k|          av1_read_op_parameters_info(&seq_params->op_params[i],
  181|  2.06k|                                      seq_params->decoder_model_info
  182|  2.06k|                                          .encoder_decoder_buffer_delay_length,
  183|  2.06k|                                      rb);
  184|  69.1k|      } else {
  185|  69.1k|        seq_params->op_params[i].decoder_model_param_present_flag = 0;
  186|  69.1k|      }
  187|  72.4k|      if (seq_params->timing_info_present &&
  ------------------
  |  Branch (187:11): [True: 6.17k, False: 66.2k]
  ------------------
  188|  72.4k|          (seq_params->timing_info.equal_picture_interval ||
  ------------------
  |  Branch (188:12): [True: 4.15k, False: 2.02k]
  ------------------
  189|  6.17k|           seq_params->op_params[i].decoder_model_param_present_flag)) {
  ------------------
  |  Branch (189:12): [True: 183, False: 1.84k]
  ------------------
  190|  4.33k|        seq_params->op_params[i].bitrate = av1_max_level_bitrate(
  191|  4.33k|            seq_params->profile, seq_params->seq_level_idx[i],
  192|  4.33k|            seq_params->tier[i]);
  193|       |        // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
  194|       |        // the check
  195|  4.33k|        if (seq_params->op_params[i].bitrate == 0)
  ------------------
  |  Branch (195:13): [True: 0, False: 4.33k]
  ------------------
  196|      0|          aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
  197|      0|                             "AV1 does not support this combination of "
  198|      0|                             "profile, level, and tier.");
  199|       |        // Buffer size in bits/s is bitrate in bits/s * 1 s
  200|  4.33k|        seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
  201|  4.33k|      }
  202|  72.4k|      if (seq_params->timing_info_present &&
  ------------------
  |  Branch (202:11): [True: 6.17k, False: 66.2k]
  ------------------
  203|  72.4k|          seq_params->timing_info.equal_picture_interval &&
  ------------------
  |  Branch (203:11): [True: 4.15k, False: 2.02k]
  ------------------
  204|  72.4k|          !seq_params->op_params[i].decoder_model_param_present_flag) {
  ------------------
  |  Branch (204:11): [True: 2.32k, False: 1.82k]
  ------------------
  205|       |        // When the decoder_model_parameters are not sent for this op, set
  206|       |        // the default ones that can be used with the resource availability mode
  207|  2.32k|        seq_params->op_params[i].decoder_buffer_delay = 70000;
  208|  2.32k|        seq_params->op_params[i].encoder_buffer_delay = 20000;
  209|  2.32k|        seq_params->op_params[i].low_delay_mode_flag = 0;
  210|  2.32k|      }
  211|       |
  212|  72.4k|      if (seq_params->display_model_info_present_flag) {
  ------------------
  |  Branch (212:11): [True: 4.39k, False: 68.0k]
  ------------------
  213|  4.39k|        seq_params->op_params[i].display_model_param_present_flag =
  214|  4.39k|            aom_rb_read_bit(rb);
  215|  4.39k|        if (seq_params->op_params[i].display_model_param_present_flag) {
  ------------------
  |  Branch (215:13): [True: 1.66k, False: 2.73k]
  ------------------
  216|  1.66k|          seq_params->op_params[i].initial_display_delay =
  217|  1.66k|              aom_rb_read_literal(rb, 4) + 1;
  218|  1.66k|          if (seq_params->op_params[i].initial_display_delay > 10)
  ------------------
  |  Branch (218:15): [True: 436, False: 1.22k]
  ------------------
  219|    436|            aom_internal_error(
  220|    436|                &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
  221|    436|                "AV1 does not support more than 10 decoded frames delay");
  222|  2.73k|        } else {
  223|  2.73k|          seq_params->op_params[i].initial_display_delay = 10;
  224|  2.73k|        }
  225|  68.0k|      } else {
  226|  68.0k|        seq_params->op_params[i].display_model_param_present_flag = 0;
  227|  68.0k|        seq_params->op_params[i].initial_display_delay = 10;
  228|  68.0k|      }
  229|  72.4k|    }
  230|  68.9k|  }
  231|       |  // This decoder supports all levels.  Choose operating point provided by
  232|       |  // external means
  233|  82.5k|  int operating_point = pbi->operating_point;
  234|  82.5k|  if (operating_point < 0 ||
  ------------------
  |  Branch (234:7): [True: 1.35k, False: 81.2k]
  ------------------
  235|  82.5k|      operating_point > seq_params->operating_points_cnt_minus_1)
  ------------------
  |  Branch (235:7): [True: 27.5k, False: 53.7k]
  ------------------
  236|  27.5k|    operating_point = 0;
  237|  82.5k|  pbi->current_operating_point =
  238|  82.5k|      seq_params->operating_point_idc[operating_point];
  239|  82.5k|  if (aom_get_num_layers_from_operating_point_idc(
  ------------------
  |  Branch (239:7): [True: 0, False: 82.5k]
  ------------------
  240|  82.5k|          pbi->current_operating_point, &pbi->number_spatial_layers,
  241|  82.5k|          &pbi->number_temporal_layers) != AOM_CODEC_OK) {
  242|      0|    pbi->error.error_code = AOM_CODEC_ERROR;
  243|      0|    return 0;
  244|      0|  }
  245|       |
  246|  82.5k|  av1_read_sequence_header(cm, rb, seq_params);
  247|       |
  248|  82.5k|  av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &pbi->error);
  249|  82.5k|  if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
  ------------------
  |  Branch (249:9): [True: 29.3k, False: 53.2k]
  |  Branch (249:43): [True: 29.3k, False: 0]
  ------------------
  250|  82.5k|      !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
  ------------------
  |  Branch (250:9): [True: 49.6k, False: 0]
  |  Branch (250:43): [True: 43.7k, False: 5.91k]
  ------------------
  251|  82.5k|      !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
  ------------------
  |  Branch (251:9): [True: 5.91k, False: 0]
  |  Branch (251:43): [True: 5.91k, False: 0]
  ------------------
  252|      0|    aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
  253|      0|                       "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, "
  254|      0|                       "%d %d subsampling is not supported.\n",
  255|      0|                       seq_params->subsampling_x, seq_params->subsampling_y);
  256|      0|  }
  257|       |
  258|  82.5k|  seq_params->film_grain_params_present = aom_rb_read_bit(rb);
  259|       |
  260|  82.5k|  if (av1_check_trailing_bits(pbi, rb) != 0) {
  ------------------
  |  Branch (260:7): [True: 3.21k, False: 79.3k]
  ------------------
  261|       |    // pbi->error.error_code is already set.
  262|  3.21k|    return 0;
  263|  3.21k|  }
  264|       |
  265|       |  // If a sequence header has been decoded before, we check if the new
  266|       |  // one is consistent with the old one.
  267|  79.3k|  if (pbi->sequence_header_ready) {
  ------------------
  |  Branch (267:7): [True: 61.0k, False: 18.2k]
  ------------------
  268|  61.0k|    if (!are_seq_headers_consistent(cm->seq_params, seq_params))
  ------------------
  |  Branch (268:9): [True: 29.2k, False: 31.8k]
  ------------------
  269|  29.2k|      pbi->sequence_header_changed = 1;
  270|  61.0k|  }
  271|       |
  272|  79.3k|  *cm->seq_params = *seq_params;
  273|  79.3k|  pbi->sequence_header_ready = 1;
  274|       |
  275|  79.3k|  return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
  276|  82.5k|}
obu.c:read_bitstream_level:
   84|  94.5k|                                struct aom_read_bit_buffer *rb) {
   85|  94.5k|  *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
  ------------------
  |  |  464|  94.5k|#define LEVEL_BITS 5
  ------------------
   86|  94.5k|  if (!is_valid_seq_level_idx(*seq_level_idx)) return 0;
  ------------------
  |  Branch (86:7): [True: 4.55k, False: 90.0k]
  ------------------
   87|  90.0k|  return 1;
   88|  94.5k|}
obu.c:are_seq_headers_consistent:
   96|  61.0k|                                      const SequenceHeader *seq_params_new) {
   97|  61.0k|  return !memcmp(seq_params_old, seq_params_new,
   98|  61.0k|                 offsetof(SequenceHeader, op_params));
   99|  61.0k|}
obu.c:read_frame_header_obu:
  285|   251k|                                      int trailing_bits_present) {
  286|   251k|  const uint32_t hdr_size =
  287|   251k|      av1_decode_frame_headers_and_setup(pbi, rb, trailing_bits_present);
  288|   251k|  const AV1_COMMON *cm = &pbi->common;
  289|   251k|  if (cm->show_existing_frame) {
  ------------------
  |  Branch (289:7): [True: 807, False: 251k]
  ------------------
  290|    807|    *p_data_end = data + hdr_size;
  291|    807|  }
  292|   251k|  return hdr_size;
  293|   251k|}
obu.c:byte_alignment:
   70|   315k|                          struct aom_read_bit_buffer *const rb) {
   71|   804k|  while (rb->bit_offset & 7) {
  ------------------
  |  Branch (71:10): [True: 495k, False: 308k]
  ------------------
   72|   495k|    if (aom_rb_read_bit(rb)) {
  ------------------
  |  Branch (72:9): [True: 6.82k, False: 488k]
  ------------------
   73|  6.82k|      cm->error->error_code = AOM_CODEC_CORRUPT_FRAME;
   74|  6.82k|      return -1;
   75|  6.82k|    }
   76|   495k|  }
   77|   308k|  return 0;
   78|   315k|}
obu.c:read_one_tile_group_obu:
  354|   156k|    int *is_last_tg, int tile_start_implicit) {
  355|   156k|  AV1_COMMON *const cm = &pbi->common;
  356|   156k|  int start_tile, end_tile;
  357|   156k|  int32_t header_size, tg_payload_size;
  358|       |
  359|   156k|  assert((rb->bit_offset & 7) == 0);
  360|   156k|  assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data);
  361|       |
  362|   156k|  header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile,
  363|   156k|                                       tile_start_implicit);
  364|   156k|  if (header_size == -1 || byte_alignment(cm, rb)) return 0;
  ------------------
  |  Branch (364:7): [True: 1.22k, False: 154k]
  |  Branch (364:28): [True: 2.16k, False: 152k]
  ------------------
  365|   154k|  data += header_size;
  366|   154k|  av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile,
  367|   154k|                                 end_tile, is_first_tg);
  368|       |
  369|   154k|  tg_payload_size = (uint32_t)(*p_data_end - data);
  370|       |
  371|   154k|  *is_last_tg = end_tile == cm->tiles.rows * cm->tiles.cols - 1;
  372|   154k|  return header_size + tg_payload_size;
  373|   156k|}
obu.c:read_tile_group_header:
  300|   156k|                                      int tile_start_implicit) {
  301|   156k|  AV1_COMMON *const cm = &pbi->common;
  302|   156k|  CommonTileParams *const tiles = &cm->tiles;
  303|   156k|  uint32_t saved_bit_offset = rb->bit_offset;
  304|   156k|  int tile_start_and_end_present_flag = 0;
  305|   156k|  const int num_tiles = tiles->rows * tiles->cols;
  306|       |
  307|   156k|  if (!tiles->large_scale && num_tiles > 1) {
  ------------------
  |  Branch (307:7): [True: 135k, False: 20.8k]
  |  Branch (307:30): [True: 8.62k, False: 126k]
  ------------------
  308|  8.62k|    tile_start_and_end_present_flag = aom_rb_read_bit(rb);
  309|  8.62k|    if (tile_start_implicit && tile_start_and_end_present_flag) {
  ------------------
  |  Branch (309:9): [True: 8.55k, False: 70]
  |  Branch (309:32): [True: 1.18k, False: 7.37k]
  ------------------
  310|  1.18k|      aom_internal_error(
  311|  1.18k|          &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
  312|  1.18k|          "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
  313|  1.18k|      return -1;
  314|  1.18k|    }
  315|  8.62k|  }
  316|   155k|  if (tiles->large_scale || num_tiles == 1 ||
  ------------------
  |  Branch (316:7): [True: 20.9k, False: 134k]
  |  Branch (316:29): [True: 126k, False: 7.39k]
  ------------------
  317|   155k|      !tile_start_and_end_present_flag) {
  ------------------
  |  Branch (317:7): [True: 7.37k, False: 24]
  ------------------
  318|   154k|    *start_tile = 0;
  319|   154k|    *end_tile = num_tiles - 1;
  320|   154k|  } else {
  321|     69|    int tile_bits = tiles->log2_rows + tiles->log2_cols;
  322|     69|    *start_tile = aom_rb_read_literal(rb, tile_bits);
  323|     69|    *end_tile = aom_rb_read_literal(rb, tile_bits);
  324|     69|  }
  325|   155k|  if (*start_tile != pbi->next_start_tile) {
  ------------------
  |  Branch (325:7): [True: 2, False: 155k]
  ------------------
  326|      2|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  327|      2|                       "tg_start (%d) must be equal to %d", *start_tile,
  328|      2|                       pbi->next_start_tile);
  329|      2|    return -1;
  330|      2|  }
  331|   155k|  if (*start_tile > *end_tile) {
  ------------------
  |  Branch (331:7): [True: 0, False: 155k]
  ------------------
  332|      0|    aom_internal_error(
  333|      0|        &pbi->error, AOM_CODEC_CORRUPT_FRAME,
  334|      0|        "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile,
  335|      0|        *start_tile);
  336|      0|    return -1;
  337|      0|  }
  338|   155k|  if (*end_tile >= num_tiles) {
  ------------------
  |  Branch (338:7): [True: 1, False: 155k]
  ------------------
  339|      1|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  340|      1|                       "tg_end (%d) must be less than NumTiles (%d)", *end_tile,
  341|      1|                       num_tiles);
  342|      1|    return -1;
  343|      1|  }
  344|   155k|  pbi->next_start_tile = (*end_tile == num_tiles - 1) ? 0 : *end_tile + 1;
  ------------------
  |  Branch (344:26): [True: 154k, False: 66]
  ------------------
  345|       |
  346|   155k|  return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
  347|   155k|}
obu.c:read_metadata:
  790|  10.9k|                            bool has_obu_extension_header) {
  791|  10.9k|  size_t type_length;
  792|  10.9k|  uint64_t type_value;
  793|  10.9k|  if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) {
  ------------------
  |  Branch (793:7): [True: 130, False: 10.8k]
  ------------------
  794|    130|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  795|    130|    return 0;
  796|    130|  }
  797|  10.8k|  const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value;
  798|  10.8k|  if (metadata_type == 0 || metadata_type >= 6) {
  ------------------
  |  Branch (798:7): [True: 1.17k, False: 9.67k]
  |  Branch (798:29): [True: 2.60k, False: 7.07k]
  ------------------
  799|       |    // If metadata_type is reserved for future use or a user private value,
  800|       |    // ignore the entire OBU and just check trailing bits.
  801|  3.77k|    if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) {
  ------------------
  |  Branch (801:9): [True: 356, False: 3.42k]
  ------------------
  802|    356|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  803|    356|      return 0;
  804|    356|    }
  805|  3.42k|    return sz;
  806|  3.77k|  }
  807|  7.07k|  if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) {
  ------------------
  |  Branch (807:7): [True: 587, False: 6.48k]
  ------------------
  808|       |    // read_metadata_itut_t35() checks trailing bits.
  809|    587|    read_metadata_itut_t35(pbi, data + type_length, sz - type_length,
  810|    587|                           has_obu_extension_header);
  811|    587|    return sz;
  812|  6.48k|  } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) {
  ------------------
  |  Branch (812:14): [True: 627, False: 5.86k]
  ------------------
  813|    627|    size_t bytes_read =
  814|    627|        type_length +
  815|    627|        read_metadata_hdr_cll(pbi, data + type_length, sz - type_length);
  816|    627|    if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
  ------------------
  |  Branch (816:9): [True: 506, False: 121]
  ------------------
  817|    506|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  818|    506|      return 0;
  819|    506|    }
  820|    121|    return sz;
  821|  5.86k|  } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) {
  ------------------
  |  Branch (821:14): [True: 674, False: 5.18k]
  ------------------
  822|    674|    size_t bytes_read =
  823|    674|        type_length +
  824|    674|        read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length);
  825|    674|    if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
  ------------------
  |  Branch (825:9): [True: 366, False: 308]
  ------------------
  826|    366|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  827|    366|      return 0;
  828|    366|    }
  829|    308|    return sz;
  830|    674|  }
  831|       |
  832|  5.18k|  struct aom_read_bit_buffer rb;
  833|  5.18k|  av1_init_read_bit_buffer(pbi, &rb, data + type_length, data + sz);
  834|  5.18k|  if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) {
  ------------------
  |  Branch (834:7): [True: 4.47k, False: 707]
  ------------------
  835|  4.47k|    read_metadata_scalability(&rb);
  836|  4.47k|  } else {
  837|    707|    assert(metadata_type == OBU_METADATA_TYPE_TIMECODE);
  838|    707|    read_metadata_timecode(&rb);
  839|    707|  }
  840|  5.18k|  if (av1_check_trailing_bits(pbi, &rb) != 0) {
  ------------------
  |  Branch (840:7): [True: 3.28k, False: 1.89k]
  ------------------
  841|       |    // pbi->error.error_code is already set.
  842|  3.28k|    return 0;
  843|  3.28k|  }
  844|  1.89k|  assert((rb.bit_offset & 7) == 0);
  845|    158|  return type_length + (rb.bit_offset >> 3);
  846|  1.89k|}
obu.c:read_metadata_itut_t35:
  631|    587|                                   size_t sz, bool has_obu_extension_header) {
  632|    587|  if (sz == 0) {
  ------------------
  |  Branch (632:7): [True: 156, False: 431]
  ------------------
  633|    156|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  634|    156|                       "itu_t_t35_country_code is missing");
  635|    156|  }
  636|    587|  int country_code_size = 1;
  637|    587|  if (*data == 0xFF) {
  ------------------
  |  Branch (637:7): [True: 229, False: 358]
  ------------------
  638|    229|    if (sz == 1) {
  ------------------
  |  Branch (638:9): [True: 10, False: 219]
  ------------------
  639|     10|      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  640|     10|                         "itu_t_t35_country_code_extension_byte is missing");
  641|     10|    }
  642|    229|    ++country_code_size;
  643|    229|  }
  644|    587|  int end_index = get_last_nonzero_byte_index(data, sz);
  645|    587|  if (end_index < country_code_size) {
  ------------------
  |  Branch (645:7): [True: 27, False: 560]
  ------------------
  646|     27|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  647|     27|                       "No trailing bits found in ITU-T T.35 metadata OBU");
  648|     27|  }
  649|       |  // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says:
  650|       |  //   itu_t_t35_payload_bytes shall be bytes containing data registered as
  651|       |  //   specified in Recommendation ITU-T T.35.
  652|       |  // Therefore the first trailing byte should be 0x80.
  653|    587|  if (data[end_index] != 0x80) {
  ------------------
  |  Branch (653:7): [True: 282, False: 305]
  ------------------
  654|    282|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  655|    282|                       "The last nonzero byte of the ITU-T T.35 metadata OBU "
  656|    282|                       "is 0x%02x, should be 0x80.",
  657|    282|                       data[end_index]);
  658|    282|  }
  659|    587|  alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, end_index,
  660|    587|                      has_obu_extension_header
  ------------------
  |  Branch (660:23): [True: 3, False: 584]
  ------------------
  661|    587|                          ? AOM_MIF_ANY_FRAME_LAYER_SPECIFIC
  662|    587|                          : AOM_MIF_ANY_FRAME);
  663|    587|}
obu.c:get_last_nonzero_byte_index:
  589|    421|static int get_last_nonzero_byte_index(const uint8_t *data, size_t sz) {
  590|       |  // Scan backward and return on the first nonzero byte.
  591|    421|  int i = (int)sz - 1;
  592|  1.09k|  while (i >= 0 && data[i] == 0) {
  ------------------
  |  Branch (592:10): [True: 1.08k, False: 16]
  |  Branch (592:20): [True: 678, False: 405]
  ------------------
  593|    678|    --i;
  594|    678|  }
  595|    421|  return i;
  596|    421|}
obu.c:alloc_read_metadata:
  602|  1.07k|                                aom_metadata_insert_flags_t insert_flag) {
  603|  1.07k|  if (!pbi->metadata) {
  ------------------
  |  Branch (603:7): [True: 118, False: 961]
  ------------------
  604|    118|    pbi->metadata = aom_img_metadata_array_alloc(0);
  605|    118|    if (!pbi->metadata) {
  ------------------
  |  Branch (605:9): [True: 0, False: 118]
  ------------------
  606|      0|      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
  607|      0|                         "Failed to allocate metadata array");
  608|      0|    }
  609|    118|  }
  610|  1.07k|  aom_metadata_t *metadata =
  611|  1.07k|      aom_img_metadata_alloc(metadata_type, data, sz, insert_flag);
  612|  1.07k|  if (!metadata) {
  ------------------
  |  Branch (612:7): [True: 0, False: 1.07k]
  ------------------
  613|      0|    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
  614|      0|                       "Error allocating metadata");
  615|      0|  }
  616|  1.07k|  aom_metadata_t **metadata_array =
  617|  1.07k|      (aom_metadata_t **)realloc(pbi->metadata->metadata_array,
  618|  1.07k|                                 (pbi->metadata->sz + 1) * sizeof(metadata));
  619|  1.07k|  if (!metadata_array) {
  ------------------
  |  Branch (619:7): [True: 0, False: 1.07k]
  ------------------
  620|      0|    aom_img_metadata_free(metadata);
  621|      0|    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
  622|      0|                       "Error growing metadata array");
  623|      0|  }
  624|  1.07k|  pbi->metadata->metadata_array = metadata_array;
  625|  1.07k|  pbi->metadata->metadata_array[pbi->metadata->sz] = metadata;
  626|  1.07k|  pbi->metadata->sz++;
  627|  1.07k|}
obu.c:read_metadata_hdr_cll:
  668|    627|                                    size_t sz) {
  669|    627|  const size_t kHdrCllPayloadSize = 4;
  670|    627|  if (sz < kHdrCllPayloadSize) {
  ------------------
  |  Branch (670:7): [True: 41, False: 586]
  ------------------
  671|     41|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  672|     41|                       "Incorrect HDR CLL metadata payload size");
  673|     41|  }
  674|    627|  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize,
  675|    627|                      AOM_MIF_ANY_FRAME);
  676|    627|  return kHdrCllPayloadSize;
  677|    627|}
obu.c:read_metadata_hdr_mdcv:
  682|    674|                                     size_t sz) {
  683|    674|  const size_t kMdcvPayloadSize = 24;
  684|    674|  if (sz < kMdcvPayloadSize) {
  ------------------
  |  Branch (684:7): [True: 293, False: 381]
  ------------------
  685|    293|    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
  686|    293|                       "Incorrect HDR MDCV metadata payload size");
  687|    293|  }
  688|    674|  alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize,
  689|    674|                      AOM_MIF_ANY_FRAME);
  690|    674|  return kMdcvPayloadSize;
  691|    674|}
obu.c:read_metadata_scalability:
  727|  4.47k|static void read_metadata_scalability(struct aom_read_bit_buffer *rb) {
  728|  4.47k|  const int scalability_mode_idc = aom_rb_read_literal(rb, 8);
  729|  4.47k|  if (scalability_mode_idc == SCALABILITY_SS) {
  ------------------
  |  Branch (729:7): [True: 4.39k, False: 87]
  ------------------
  730|  4.39k|    scalability_structure(rb);
  731|  4.39k|  }
  732|  4.47k|}
obu.c:scalability_structure:
  693|  4.39k|static void scalability_structure(struct aom_read_bit_buffer *rb) {
  694|  4.39k|  const int spatial_layers_cnt_minus_1 = aom_rb_read_literal(rb, 2);
  695|  4.39k|  const int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
  696|  4.39k|  const int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
  697|  4.39k|  const int temporal_group_description_present_flag = aom_rb_read_bit(rb);
  698|       |  // scalability_structure_reserved_3bits must be set to zero and be ignored by
  699|       |  // decoders.
  700|  4.39k|  aom_rb_read_literal(rb, 3);
  701|       |
  702|  4.39k|  if (spatial_layer_dimensions_present_flag) {
  ------------------
  |  Branch (702:7): [True: 2.11k, False: 2.27k]
  ------------------
  703|  5.02k|    for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) {
  ------------------
  |  Branch (703:21): [True: 2.90k, False: 2.11k]
  ------------------
  704|  2.90k|      aom_rb_read_literal(rb, 16);
  705|  2.90k|      aom_rb_read_literal(rb, 16);
  706|  2.90k|    }
  707|  2.11k|  }
  708|  4.39k|  if (spatial_layer_description_present_flag) {
  ------------------
  |  Branch (708:7): [True: 359, False: 4.03k]
  ------------------
  709|  1.09k|    for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) {
  ------------------
  |  Branch (709:21): [True: 736, False: 359]
  ------------------
  710|    736|      aom_rb_read_literal(rb, 8);
  711|    736|    }
  712|    359|  }
  713|  4.39k|  if (temporal_group_description_present_flag) {
  ------------------
  |  Branch (713:7): [True: 2.29k, False: 2.10k]
  ------------------
  714|  2.29k|    const int temporal_group_size = aom_rb_read_literal(rb, 8);
  715|  5.25k|    for (int i = 0; i < temporal_group_size; i++) {
  ------------------
  |  Branch (715:21): [True: 2.96k, False: 2.29k]
  ------------------
  716|  2.96k|      aom_rb_read_literal(rb, 3);
  717|  2.96k|      aom_rb_read_bit(rb);
  718|  2.96k|      aom_rb_read_bit(rb);
  719|  2.96k|      const int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
  720|  4.44k|      for (int j = 0; j < temporal_group_ref_cnt; j++) {
  ------------------
  |  Branch (720:23): [True: 1.47k, False: 2.96k]
  ------------------
  721|  1.47k|        aom_rb_read_literal(rb, 8);
  722|  1.47k|      }
  723|  2.96k|    }
  724|  2.29k|  }
  725|  4.39k|}
obu.c:read_metadata_timecode:
  734|    707|static void read_metadata_timecode(struct aom_read_bit_buffer *rb) {
  735|    707|  aom_rb_read_literal(rb, 5);  // counting_type f(5)
  736|    707|  const int full_timestamp_flag =
  737|    707|      aom_rb_read_bit(rb);     // full_timestamp_flag f(1)
  738|    707|  aom_rb_read_bit(rb);         // discontinuity_flag (f1)
  739|    707|  aom_rb_read_bit(rb);         // cnt_dropped_flag f(1)
  740|    707|  aom_rb_read_literal(rb, 9);  // n_frames f(9)
  741|    707|  if (full_timestamp_flag) {
  ------------------
  |  Branch (741:7): [True: 280, False: 427]
  ------------------
  742|    280|    aom_rb_read_literal(rb, 6);  // seconds_value f(6)
  743|    280|    aom_rb_read_literal(rb, 6);  // minutes_value f(6)
  744|    280|    aom_rb_read_literal(rb, 5);  // hours_value f(5)
  745|    427|  } else {
  746|    427|    const int seconds_flag = aom_rb_read_bit(rb);  // seconds_flag f(1)
  747|    427|    if (seconds_flag) {
  ------------------
  |  Branch (747:9): [True: 233, False: 194]
  ------------------
  748|    233|      aom_rb_read_literal(rb, 6);                    // seconds_value f(6)
  749|    233|      const int minutes_flag = aom_rb_read_bit(rb);  // minutes_flag f(1)
  750|    233|      if (minutes_flag) {
  ------------------
  |  Branch (750:11): [True: 181, False: 52]
  ------------------
  751|    181|        aom_rb_read_literal(rb, 6);                  // minutes_value f(6)
  752|    181|        const int hours_flag = aom_rb_read_bit(rb);  // hours_flag f(1)
  753|    181|        if (hours_flag) {
  ------------------
  |  Branch (753:13): [True: 162, False: 19]
  ------------------
  754|    162|          aom_rb_read_literal(rb, 5);  // hours_value f(5)
  755|    162|        }
  756|    181|      }
  757|    233|    }
  758|    427|  }
  759|       |  // time_offset_length f(5)
  760|    707|  const int time_offset_length = aom_rb_read_literal(rb, 5);
  761|    707|  if (time_offset_length) {
  ------------------
  |  Branch (761:7): [True: 523, False: 184]
  ------------------
  762|       |    // time_offset_value f(time_offset_length)
  763|    523|    aom_rb_read_literal(rb, time_offset_length);
  764|    523|  }
  765|    707|}
obu.c:read_and_decode_one_tile_list:
  491|    801|                                              int *frame_decoding_finished) {
  492|    801|  AV1_COMMON *const cm = &pbi->common;
  493|    801|  uint32_t tile_list_payload_size = 0;
  494|    801|  const int num_tiles = cm->tiles.cols * cm->tiles.rows;
  495|    801|  const int start_tile = 0;
  496|    801|  const int end_tile = num_tiles - 1;
  497|    801|  int i = 0;
  498|       |
  499|       |  // Process the tile list info.
  500|    801|  pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
  501|    801|  pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
  502|    801|  pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
  503|       |
  504|       |  // The output frame is used to store the decoded tile list. The decoded tile
  505|       |  // list has to fit into 1 output frame.
  506|    801|  if ((pbi->tile_count_minus_1 + 1) >
  ------------------
  |  Branch (506:7): [True: 91, False: 710]
  ------------------
  507|    801|      (pbi->output_frame_width_in_tiles_minus_1 + 1) *
  508|    801|          (pbi->output_frame_height_in_tiles_minus_1 + 1)) {
  509|     91|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  510|     91|    return 0;
  511|     91|  }
  512|       |
  513|    710|  if (pbi->tile_count_minus_1 > MAX_TILES - 1) {
  ------------------
  |  |  643|    710|#define MAX_TILES 512
  ------------------
  |  Branch (513:7): [True: 12, False: 698]
  ------------------
  514|     12|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  515|     12|    return 0;
  516|     12|  }
  517|       |
  518|    698|  int tile_width, tile_height;
  519|    698|  if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) {
  ------------------
  |  Branch (519:7): [True: 88, False: 610]
  ------------------
  520|     88|    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  521|     88|    return 0;
  522|     88|  }
  523|    610|  const int tile_width_in_pixels = tile_width * MI_SIZE;
  ------------------
  |  |   40|    610|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|    610|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  524|    610|  const int tile_height_in_pixels = tile_height * MI_SIZE;
  ------------------
  |  |   40|    610|#define MI_SIZE (1 << MI_SIZE_LOG2)
  |  |  ------------------
  |  |  |  |   39|    610|#define MI_SIZE_LOG2 2
  |  |  ------------------
  ------------------
  525|       |
  526|       |  // Allocate output frame buffer for the tile list.
  527|    610|  alloc_tile_list_buffer(pbi, tile_width_in_pixels, tile_height_in_pixels);
  528|       |
  529|    610|  uint32_t tile_list_info_bytes = 4;
  530|    610|  tile_list_payload_size += tile_list_info_bytes;
  531|    610|  data += tile_list_info_bytes;
  532|       |
  533|    610|  int tile_idx = 0;
  534|    610|  for (i = 0; i <= pbi->tile_count_minus_1; i++) {
  ------------------
  |  Branch (534:15): [True: 437, False: 173]
  ------------------
  535|       |    // Process 1 tile.
  536|       |    // Reset the bit reader.
  537|    437|    rb->bit_offset = 0;
  538|    437|    rb->bit_buffer = data;
  539|       |
  540|       |    // Read out the tile info.
  541|    437|    uint32_t tile_info_bytes = 5;
  542|       |    // Set reference for each tile.
  543|    437|    int ref_idx = aom_rb_read_literal(rb, 8);
  544|    437|    if (ref_idx >= MAX_EXTERNAL_REFERENCES) {
  ------------------
  |  |  642|    437|#define MAX_EXTERNAL_REFERENCES 128
  ------------------
  |  Branch (544:9): [True: 80, False: 357]
  ------------------
  545|     80|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  546|     80|      return 0;
  547|     80|    }
  548|    357|    av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1,
  549|    357|                          &pbi->ext_refs.refs[ref_idx]);
  550|       |
  551|    357|    pbi->dec_tile_row = aom_rb_read_literal(rb, 8);
  552|    357|    pbi->dec_tile_col = aom_rb_read_literal(rb, 8);
  553|    357|    if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 ||
  ------------------
  |  Branch (553:9): [True: 357, False: 0]
  |  Branch (553:34): [True: 0, False: 0]
  ------------------
  554|    357|        pbi->dec_tile_row >= cm->tiles.rows ||
  ------------------
  |  Branch (554:9): [True: 0, False: 0]
  ------------------
  555|    357|        pbi->dec_tile_col >= cm->tiles.cols) {
  ------------------
  |  Branch (555:9): [True: 0, False: 0]
  ------------------
  556|      0|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  557|      0|      return 0;
  558|      0|    }
  559|       |
  560|    357|    pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1;
  561|    357|    data += tile_info_bytes;
  562|    357|    if ((size_t)(data_end - data) < pbi->coded_tile_data_size) {
  ------------------
  |  Branch (562:9): [True: 0, False: 357]
  ------------------
  563|      0|      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
  564|      0|      return 0;
  565|      0|    }
  566|       |
  567|    357|    av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size,
  568|    357|                                   p_data_end, start_tile, end_tile, 0);
  569|    357|    uint32_t tile_payload_size = (uint32_t)(*p_data_end - data);
  570|       |
  571|    357|    tile_list_payload_size += tile_info_bytes + tile_payload_size;
  572|       |
  573|       |    // Update data ptr for next tile decoding.
  574|    357|    data = *p_data_end;
  575|    357|    assert(data <= data_end);
  576|       |
  577|       |    // Copy the decoded tile to the tile list output buffer.
  578|      0|    copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx, tile_width_in_pixels,
  579|      0|                                          tile_height_in_pixels);
  580|      0|    tile_idx++;
  581|      0|  }
  582|       |
  583|    173|  *frame_decoding_finished = 1;
  584|    173|  return tile_list_payload_size;
  585|    610|}
obu.c:alloc_tile_list_buffer:
  376|    610|                                   int tile_height_in_pixels) {
  377|       |  // The resolution of the output frame is read out from the bitstream. The data
  378|       |  // are stored in the order of Y plane, U plane and V plane. As an example, for
  379|       |  // image format 4:2:0, the output frame of U plane and V plane is 1/4 of the
  380|       |  // output frame.
  381|    610|  AV1_COMMON *const cm = &pbi->common;
  382|    610|  const int output_frame_width =
  383|    610|      (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels;
  384|    610|  const int output_frame_height =
  385|    610|      (pbi->output_frame_height_in_tiles_minus_1 + 1) * tile_height_in_pixels;
  386|       |  // The output frame is used to store the decoded tile list. The decoded tile
  387|       |  // list has to fit into 1 output frame.
  388|    610|  assert((pbi->tile_count_minus_1 + 1) <=
  389|    610|         (pbi->output_frame_width_in_tiles_minus_1 + 1) *
  390|    610|             (pbi->output_frame_height_in_tiles_minus_1 + 1));
  391|       |
  392|       |  // Allocate the tile list output buffer.
  393|       |  // Note: if cm->seq_params->use_highbitdepth is 1 and
  394|       |  // cm->seq_params->bit_depth is 8, we could allocate less memory, namely, 8
  395|       |  // bits/pixel.
  396|    610|  if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width,
  ------------------
  |  Branch (396:7): [True: 173, False: 437]
  ------------------
  397|    610|                             output_frame_height, cm->seq_params->subsampling_x,
  398|    610|                             cm->seq_params->subsampling_y,
  399|    610|                             (cm->seq_params->use_highbitdepth &&
  ------------------
  |  Branch (399:31): [True: 211, False: 399]
  ------------------
  400|    610|                              (cm->seq_params->bit_depth > AOM_BITS_8)),
  ------------------
  |  Branch (400:31): [True: 211, False: 0]
  ------------------
  401|    610|                             0, cm->features.byte_alignment, false, 0))
  402|    173|    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
  403|    173|                       "Failed to allocate the tile list output buffer");
  404|    610|}
obu.c:read_padding:
  851|  3.39k|                           size_t sz) {
  852|       |  // The spec allows a padding OBU to be header-only (i.e., obu_size = 0). So
  853|       |  // check trailing bits only if sz > 0.
  854|  3.39k|  if (sz > 0) {
  ------------------
  |  Branch (854:7): [True: 68, False: 3.33k]
  ------------------
  855|       |    // The payload of a padding OBU is byte aligned. Therefore the first
  856|       |    // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
  857|     68|    const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz);
  858|     68|    if (last_nonzero_byte != 0x80) {
  ------------------
  |  Branch (858:9): [True: 44, False: 24]
  ------------------
  859|     44|      cm->error->error_code = AOM_CODEC_CORRUPT_FRAME;
  860|     44|      return 0;
  861|     44|    }
  862|     68|  }
  863|  3.35k|  return sz;
  864|  3.39k|}
obu.c:get_last_nonzero_byte:
  774|  6.43k|static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) {
  775|       |  // Scan backward and return on the first nonzero byte.
  776|  6.43k|  size_t i = sz;
  777|  26.7k|  while (i != 0) {
  ------------------
  |  Branch (777:10): [True: 25.9k, False: 772]
  ------------------
  778|  25.9k|    --i;
  779|  25.9k|    if (data[i] != 0) return data[i];
  ------------------
  |  Branch (779:9): [True: 5.65k, False: 20.3k]
  ------------------
  780|  25.9k|  }
  781|    772|  return 0;
  782|  6.43k|}

LLVMFuzzerTestOneInput:
   32|  16.1k|extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
   33|  16.1k|  if (size <= IVF_FILE_HDR_SZ) {
  ------------------
  |  |   28|  16.1k|#define IVF_FILE_HDR_SZ 32
  ------------------
  |  Branch (33:7): [True: 9, False: 16.1k]
  ------------------
   34|      9|    return 0;
   35|      9|  }
   36|       |
   37|       |  // Abusing the four unused bytes at the end of the IVF file header as a source
   38|       |  // of random bits.
   39|  16.1k|  unsigned int tile_mode = (data[IVF_FILE_HDR_SZ - 1] & 2) != 0;
  ------------------
  |  |   28|  16.1k|#define IVF_FILE_HDR_SZ 32
  ------------------
   40|  16.1k|  unsigned int ext_tile_debug = (data[IVF_FILE_HDR_SZ - 1] & 4) != 0;
  ------------------
  |  |   28|  16.1k|#define IVF_FILE_HDR_SZ 32
  ------------------
   41|  16.1k|  unsigned int is_annexb = (data[IVF_FILE_HDR_SZ - 1] & 8) != 0;
  ------------------
  |  |   28|  16.1k|#define IVF_FILE_HDR_SZ 32
  ------------------
   42|  16.1k|  int output_all_layers = (data[IVF_FILE_HDR_SZ - 1] & 0x10) != 0;
  ------------------
  |  |   28|  16.1k|#define IVF_FILE_HDR_SZ 32
  ------------------
   43|  16.1k|  int operating_point = data[IVF_FILE_HDR_SZ - 2] & 0x1F;
  ------------------
  |  |   28|  16.1k|#define IVF_FILE_HDR_SZ 32
  ------------------
   44|       |
   45|  16.1k|  aom_codec_iface_t *codec_interface = aom_codec_av1_dx();
   46|  16.1k|  aom_codec_ctx_t codec;
   47|       |  // Set thread count in the range [1, 64].
   48|  16.1k|  const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1;
  ------------------
  |  |   28|  16.1k|#define IVF_FILE_HDR_SZ 32
  ------------------
   49|  16.1k|  aom_codec_dec_cfg_t cfg = { threads, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
  ------------------
  |  |   79|  16.1k|#define FORCE_HIGHBITDEPTH_DECODING 0
  ------------------
   50|  16.1k|  if (aom_codec_dec_init(&codec, codec_interface, &cfg, 0)) {
  ------------------
  |  |  130|  16.1k|  aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION)
  |  |  ------------------
  |  |  |  |   45|  16.1k|  (6 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
  |  |  |  |  ------------------
  |  |  |  |  |  |  152|  16.1k|#define AOM_CODEC_ABI_VERSION (7 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   33|  16.1k|#define AOM_IMAGE_ABI_VERSION (9) /**<\hideinitializer*/
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  |  |  |  Branch (130:3): [True: 0, False: 16.1k]
  |  |  ------------------
  ------------------
   51|      0|    return 0;
   52|      0|  }
   53|  16.1k|  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, tile_mode);
  ------------------
  |  |  543|  16.1k|  aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/
  ------------------
   54|  16.1k|  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, ext_tile_debug);
  ------------------
  |  |  543|  16.1k|  aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/
  ------------------
   55|  16.1k|  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, is_annexb);
  ------------------
  |  |  543|  16.1k|  aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/
  ------------------
   56|  16.1k|  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_OUTPUT_ALL_LAYERS,
  ------------------
  |  |  543|  16.1k|  aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/
  ------------------
   57|  16.1k|                                output_all_layers);
   58|  16.1k|  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_OPERATING_POINT,
  ------------------
  |  |  543|  16.1k|  aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/
  ------------------
   59|  16.1k|                                operating_point);
   60|       |
   61|  16.1k|  data += IVF_FILE_HDR_SZ;
  ------------------
  |  |   28|  16.1k|#define IVF_FILE_HDR_SZ 32
  ------------------
   62|  16.1k|  size -= IVF_FILE_HDR_SZ;
  ------------------
  |  |   28|  16.1k|#define IVF_FILE_HDR_SZ 32
  ------------------
   63|       |
   64|   308k|  while (size > IVF_FRAME_HDR_SZ) {
  ------------------
  |  |   27|   308k|#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
  ------------------
  |  Branch (64:10): [True: 292k, False: 16.1k]
  ------------------
   65|   292k|    size_t frame_size = mem_get_le32(data);
  ------------------
  |  |  124|   292k|#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32)
  |  |  ------------------
  |  |  |  |   51|   292k|#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
  |  |  |  |  ------------------
  |  |  |  |  |  |   53|   292k|#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ)
  |  |  |  |  |  |  ------------------
  |  |  |  |  |  |  |  |   55|   292k|#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ
  |  |  |  |  |  |  ------------------
  |  |  |  |  ------------------
  |  |  ------------------
  ------------------
   66|   292k|    size -= IVF_FRAME_HDR_SZ;
  ------------------
  |  |   27|   292k|#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
  ------------------
   67|   292k|    data += IVF_FRAME_HDR_SZ;
  ------------------
  |  |   27|   292k|#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
  ------------------
   68|   292k|    frame_size = std::min(size, frame_size);
   69|       |
   70|   292k|    aom_codec_stream_info_t stream_info;
   71|   292k|    stream_info.is_annexb = is_annexb;
   72|   292k|    aom_codec_err_t err =
   73|   292k|        aom_codec_peek_stream_info(codec_interface, data, size, &stream_info);
   74|   292k|    static_cast<void>(err);
   75|       |
   76|   292k|    err = aom_codec_decode(&codec, data, frame_size, nullptr);
   77|   292k|    static_cast<void>(err);
   78|   292k|    aom_codec_iter_t iter = nullptr;
   79|   292k|    aom_image_t *img = nullptr;
   80|   354k|    while ((img = aom_codec_get_frame(&codec, &iter)) != nullptr) {
  ------------------
  |  Branch (80:12): [True: 62.4k, False: 292k]
  ------------------
   81|  62.4k|    }
   82|   292k|    data += frame_size;
   83|   292k|    size -= frame_size;
   84|   292k|  }
   85|  16.1k|  aom_codec_destroy(&codec);
   86|  16.1k|  return 0;
   87|  16.1k|}

convolve_2d_avx2.c:loadu_8bit_16x2_avx2:
   70|  8.77M|                                           const ptrdiff_t strideInByte) {
   71|  8.77M|  const __m128i src0 = _mm_loadu_si128((__m128i *)src);
   72|  8.77M|  const __m128i src1 =
   73|  8.77M|      _mm_loadu_si128((__m128i *)((uint8_t *)src + strideInByte));
   74|  8.77M|  return _mm256_setr_m128i(src0, src1);
  ------------------
  |  |   29|  8.77M|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  8.77M|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
   75|  8.77M|}
convolve_2d_avx2.c:storeu_u8_16x2_avx2:
   96|  1.71M|                                       const ptrdiff_t stride) {
   97|  1.71M|  storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
   98|  1.71M|}
convolve_2d_avx2.c:storeu_8bit_16x2_avx2:
   88|  1.71M|                                         const ptrdiff_t strideInByte) {
   89|  1.71M|  const __m128i d0 = _mm256_castsi256_si128(src);
   90|  1.71M|  const __m128i d1 = _mm256_extracti128_si256(src, 1);
   91|  1.71M|  _mm_storeu_si128((__m128i *)dst, d0);
   92|  1.71M|  _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1);
   93|  1.71M|}
convolve_avx2.c:storeu_u8_16x2_avx2:
   96|  1.64M|                                       const ptrdiff_t stride) {
   97|  1.64M|  storeu_8bit_16x2_avx2(src, dst, sizeof(*dst) * stride);
   98|  1.64M|}
convolve_avx2.c:storeu_8bit_16x2_avx2:
   88|  1.64M|                                         const ptrdiff_t strideInByte) {
   89|  1.64M|  const __m128i d0 = _mm256_castsi256_si128(src);
   90|  1.64M|  const __m128i d1 = _mm256_extracti128_si256(src, 1);
   91|  1.64M|  _mm_storeu_si128((__m128i *)dst, d0);
   92|  1.64M|  _mm_storeu_si128((__m128i *)((uint8_t *)dst + strideInByte), d1);
   93|  1.64M|}
convolve_avx2.c:loadu_8bit_16x2_avx2:
   70|  2.68M|                                           const ptrdiff_t strideInByte) {
   71|  2.68M|  const __m128i src0 = _mm_loadu_si128((__m128i *)src);
   72|  2.68M|  const __m128i src1 =
   73|  2.68M|      _mm_loadu_si128((__m128i *)((uint8_t *)src + strideInByte));
   74|  2.68M|  return _mm256_setr_m128i(src0, src1);
  ------------------
  |  |   29|  2.68M|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  2.68M|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
   75|  2.68M|}

convolve_2d_avx2.c:load_u8_4x2_sse4_1:
   30|  24.6k|                                         const ptrdiff_t stride) {
   31|  24.6k|  return load8bit_4x2_sse4_1(src, sizeof(*src) * stride);
   32|  24.6k|}
convolve_2d_avx2.c:load8bit_4x2_sse4_1:
   24|  24.6k|                                          const ptrdiff_t strideInByte) {
   25|  24.6k|  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
   26|  24.6k|  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + strideInByte), 1);
   27|  24.6k|}
convolve_avx2.c:load_u8_4x2_sse4_1:
   30|  9.85k|                                         const ptrdiff_t stride) {
   31|  9.85k|  return load8bit_4x2_sse4_1(src, sizeof(*src) * stride);
   32|  9.85k|}
convolve_avx2.c:load8bit_4x2_sse4_1:
   24|  9.85k|                                          const ptrdiff_t strideInByte) {
   25|  9.85k|  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
   26|  9.85k|  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + strideInByte), 1);
   27|  9.85k|}

convolve_2d_avx2.c:av1_convolve_2d_sr_specialized_avx2:
 1146|  1.82M|    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
 1147|  1.82M|  static const Convolve2dSrHorTapFunc
 1148|  1.82M|      convolve_2d_sr_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
 1149|  1.82M|        NULL,
 1150|  1.82M|        NULL,
 1151|  1.82M|        convolve_2d_sr_hor_2tap_avx2,
 1152|  1.82M|        NULL,
 1153|  1.82M|        convolve_2d_sr_hor_4tap_ssse3,
 1154|  1.82M|        NULL,
 1155|  1.82M|        convolve_2d_sr_hor_6tap_avx2,
 1156|  1.82M|        NULL,
 1157|  1.82M|        convolve_2d_sr_hor_8tap_avx2
 1158|  1.82M|      };
 1159|  1.82M|  static const Convolve2dSrVerTapFunc
 1160|  1.82M|      convolve_2d_sr_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
 1161|  1.82M|        NULL,
 1162|  1.82M|        convolve_2d_sr_ver_2tap_half_avx2,
 1163|  1.82M|        convolve_2d_sr_ver_2tap_avx2,
 1164|  1.82M|        convolve_2d_sr_ver_4tap_avx2,
 1165|  1.82M|        convolve_2d_sr_ver_4tap_avx2,
 1166|  1.82M|        convolve_2d_sr_ver_6tap_avx2,
 1167|  1.82M|        convolve_2d_sr_ver_6tap_avx2,
 1168|  1.82M|        convolve_2d_sr_ver_8tap_avx2,
 1169|  1.82M|        convolve_2d_sr_ver_8tap_avx2
 1170|  1.82M|      };
 1171|  1.82M|  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
 1172|  1.82M|  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);
 1173|       |
 1174|  1.82M|  assert(tap_x != 12 && tap_y != 12);
 1175|       |
 1176|  1.82M|  const uint8_t *src_ptr = src - ((tap_y >> 1) - 1) * src_stride;
 1177|       |  // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
 1178|       |  //       permutation.
 1179|  1.82M|  DECLARE_ALIGNED(32, int16_t,
  ------------------
  |  |   19|  1.82M|#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
  ------------------
 1180|  1.82M|                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
 1181|       |
 1182|  1.82M|  (void)conv_params;
 1183|       |
 1184|  1.82M|  assert(conv_params->round_0 == 3);
 1185|  1.82M|  assert(conv_params->round_1 == 11);
 1186|       |
 1187|       |  // horizontal filter
 1188|  1.82M|  int32_t hh = h + tap_y;
 1189|  1.82M|  assert(!(hh % 2));
 1190|       |
 1191|  1.82M|  convolve_2d_sr_hor_tap_func_table[tap_x](
 1192|  1.82M|      src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);
 1193|       |
 1194|       |  // vertical filter
 1195|  1.82M|  convolve_2d_sr_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
 1196|  1.82M|      im_block, w, h, filter_params_y, subpel_y_q4, dst, dst_stride);
 1197|  1.82M|}
convolve_2d_avx2.c:convolve_2d_sr_hor_2tap_avx2:
   20|  70.2k|    const int32_t subpel_x_q4, int16_t *const im_block) {
   21|  70.2k|  const uint8_t *src_ptr = src;
   22|  70.2k|  int32_t y = h;
   23|  70.2k|  int16_t *im = im_block;
   24|       |
   25|  70.2k|  if (w <= 8) {
  ------------------
  |  Branch (25:7): [True: 55.7k, False: 14.4k]
  ------------------
   26|  55.7k|    __m128i coeffs_128;
   27|       |
   28|  55.7k|    prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);
   29|       |
   30|  55.7k|    if (w == 2) {
  ------------------
  |  Branch (30:9): [True: 8.15k, False: 47.6k]
  ------------------
   31|  24.6k|      do {
   32|  24.6k|        const __m128i r =
   33|  24.6k|            x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
   34|  24.6k|        xy_x_round_store_2x2_sse2(r, im);
   35|  24.6k|        src_ptr += 2 * src_stride;
   36|  24.6k|        im += 2 * 2;
   37|  24.6k|        y -= 2;
   38|  24.6k|      } while (y);
  ------------------
  |  Branch (38:16): [True: 16.4k, False: 8.15k]
  ------------------
   39|  47.6k|    } else if (w == 4) {
  ------------------
  |  Branch (39:16): [True: 26.8k, False: 20.7k]
  ------------------
   40|   106k|      do {
   41|   106k|        const __m128i r =
   42|   106k|            x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
   43|   106k|        xy_x_round_store_4x2_sse2(r, im);
   44|   106k|        src_ptr += 2 * src_stride;
   45|   106k|        im += 2 * 4;
   46|   106k|        y -= 2;
   47|   106k|      } while (y);
  ------------------
  |  Branch (47:16): [True: 79.8k, False: 26.8k]
  ------------------
   48|  26.8k|    } else {
   49|  20.7k|      assert(w == 8);
   50|       |
   51|  90.6k|      do {
   52|  90.6k|        __m128i r[2];
   53|       |
   54|  90.6k|        x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
   55|  90.6k|        xy_x_round_store_8x2_sse2(r, im);
   56|  90.6k|        src_ptr += 2 * src_stride;
   57|  90.6k|        im += 2 * 8;
   58|  90.6k|        y -= 2;
   59|  90.6k|      } while (y);
  ------------------
  |  Branch (59:16): [True: 69.8k, False: 20.7k]
  ------------------
   60|  20.7k|    }
   61|  55.7k|  } else {
   62|  14.4k|    __m256i coeffs_256;
   63|       |
   64|  14.4k|    prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);
   65|       |
   66|  14.4k|    if (w == 16) {
  ------------------
  |  Branch (66:9): [True: 9.02k, False: 5.44k]
  ------------------
   67|  56.7k|      do {
   68|  56.7k|        __m256i r[2];
   69|       |
   70|  56.7k|        x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
   71|  56.7k|        xy_x_round_store_32_avx2(r, im);
   72|  56.7k|        src_ptr += 2 * src_stride;
   73|  56.7k|        im += 2 * 16;
   74|  56.7k|        y -= 2;
   75|  56.7k|      } while (y);
  ------------------
  |  Branch (75:16): [True: 47.7k, False: 9.02k]
  ------------------
   76|  9.02k|    } else if (w == 32) {
  ------------------
  |  Branch (76:16): [True: 3.31k, False: 2.13k]
  ------------------
   77|  84.7k|      do {
   78|  84.7k|        xy_x_2tap_32_avx2(src_ptr, &coeffs_256, im);
   79|  84.7k|        src_ptr += src_stride;
   80|  84.7k|        im += 32;
   81|  84.7k|      } while (--y);
  ------------------
  |  Branch (81:16): [True: 81.4k, False: 3.31k]
  ------------------
   82|  3.31k|    } else if (w == 64) {
  ------------------
  |  Branch (82:16): [True: 1.64k, False: 493]
  ------------------
   83|  77.7k|      do {
   84|  77.7k|        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
   85|  77.7k|        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
   86|  77.7k|        src_ptr += src_stride;
   87|  77.7k|        im += 64;
   88|  77.7k|      } while (--y);
  ------------------
  |  Branch (88:16): [True: 76.1k, False: 1.64k]
  ------------------
   89|  1.64k|    } else {
   90|    493|      assert(w == 128);
   91|       |
   92|  47.5k|      do {
   93|  47.5k|        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
   94|  47.5k|        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
   95|  47.5k|        xy_x_2tap_32_avx2(src_ptr + 2 * 32, &coeffs_256, im + 2 * 32);
   96|  47.5k|        xy_x_2tap_32_avx2(src_ptr + 3 * 32, &coeffs_256, im + 3 * 32);
   97|  47.5k|        src_ptr += src_stride;
   98|  47.5k|        im += 128;
   99|  47.5k|      } while (--y);
  ------------------
  |  Branch (99:16): [True: 47.0k, False: 494]
  ------------------
  100|    494|    }
  101|  14.4k|  }
  102|  70.2k|}
convolve_2d_avx2.c:convolve_2d_sr_hor_4tap_ssse3:
  107|   805k|    const int32_t subpel_x_q4, int16_t *const im_block) {
  108|   805k|  const uint8_t *src_ptr = src - 1;
  109|   805k|  int32_t y = h;
  110|   805k|  int16_t *im = im_block;
  111|       |
  112|   805k|  if (w <= 4) {
  ------------------
  |  Branch (112:7): [True: 747k, False: 58.6k]
  ------------------
  113|   747k|    __m128i coeffs_128[2];
  114|       |
  115|   747k|    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
  116|   747k|    if (w == 2) {
  ------------------
  |  Branch (116:9): [True: 149k, False: 598k]
  ------------------
  117|   760k|      do {
  118|   760k|        const __m128i r =
  119|   760k|            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
  120|   760k|        xy_x_round_store_2x2_sse2(r, im);
  121|   760k|        src_ptr += 2 * src_stride;
  122|   760k|        im += 2 * 2;
  123|   760k|        y -= 2;
  124|   760k|      } while (y);
  ------------------
  |  Branch (124:16): [True: 611k, False: 149k]
  ------------------
  125|   598k|    } else if (w == 4) {
  ------------------
  |  Branch (125:16): [True: 598k, False: 9]
  ------------------
  126|  3.42M|      do {
  127|  3.42M|        const __m128i r =
  128|  3.42M|            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
  129|  3.42M|        xy_x_round_store_4x2_sse2(r, im);
  130|  3.42M|        src_ptr += 2 * src_stride;
  131|  3.42M|        im += 2 * 4;
  132|  3.42M|        y -= 2;
  133|  3.42M|      } while (y);
  ------------------
  |  Branch (133:16): [True: 2.82M, False: 598k]
  ------------------
  134|   598k|    }
  135|   747k|  } else {
  136|       |    // TODO(chiyotsai@google.com): Add better optimization
  137|  58.6k|    __m256i coeffs_256[2], filt_256[2];
  138|       |
  139|  58.6k|    prepare_half_coeffs_4tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
  140|  58.6k|    filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  141|  58.6k|    filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  142|       |
  143|  58.6k|    if (w == 8) {
  ------------------
  |  Branch (143:9): [True: 33.1k, False: 25.5k]
  ------------------
  144|   200k|      do {
  145|   200k|        __m256i res =
  146|   200k|            x_convolve_4tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
  147|   200k|        xy_x_round_store_8x2_avx2(res, im);
  148|       |
  149|   200k|        src_ptr += 2 * src_stride;
  150|   200k|        im += 2 * 8;
  151|   200k|        y -= 2;
  152|   200k|      } while (y);
  ------------------
  |  Branch (152:16): [True: 167k, False: 33.1k]
  ------------------
  153|  33.1k|    } else if (w == 16) {
  ------------------
  |  Branch (153:16): [True: 18.3k, False: 7.13k]
  ------------------
  154|   141k|      do {
  155|   141k|        __m256i r[2];
  156|       |
  157|   141k|        x_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
  158|   141k|        xy_x_round_store_32_avx2(r, im);
  159|   141k|        src_ptr += 2 * src_stride;
  160|   141k|        im += 2 * 16;
  161|   141k|        y -= 2;
  162|   141k|      } while (y);
  ------------------
  |  Branch (162:16): [True: 123k, False: 18.3k]
  ------------------
  163|  18.3k|    } else if (w == 32) {
  ------------------
  |  Branch (163:16): [True: 5.04k, False: 2.08k]
  ------------------
  164|   133k|      do {
  165|   133k|        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  166|       |
  167|   133k|        src_ptr += src_stride;
  168|   133k|        im += 32;
  169|   133k|      } while (--y);
  ------------------
  |  Branch (169:16): [True: 128k, False: 5.04k]
  ------------------
  170|  5.04k|    } else if (w == 64) {
  ------------------
  |  Branch (170:16): [True: 1.71k, False: 377]
  ------------------
  171|  92.9k|      do {
  172|  92.9k|        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  173|  92.9k|        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  174|  92.9k|        src_ptr += src_stride;
  175|  92.9k|        im += 64;
  176|  92.9k|      } while (--y);
  ------------------
  |  Branch (176:16): [True: 91.2k, False: 1.71k]
  ------------------
  177|  1.71k|    } else {
  178|    377|      assert(w == 128);
  179|       |
  180|  44.0k|      do {
  181|  44.0k|        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  182|  44.0k|        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  183|  44.0k|        xy_x_4tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
  184|  44.0k|        xy_x_4tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
  185|  44.0k|        src_ptr += src_stride;
  186|  44.0k|        im += 128;
  187|  44.0k|      } while (--y);
  ------------------
  |  Branch (187:16): [True: 43.6k, False: 390]
  ------------------
  188|    390|    }
  189|  58.6k|  }
  190|   805k|}
convolve_2d_avx2.c:convolve_2d_sr_hor_6tap_avx2:
  195|   890k|    const int32_t subpel_x_q4, int16_t *const im_block) {
  196|   890k|  const uint8_t *src_ptr = src - 2;
  197|   890k|  int32_t y = h;
  198|   890k|  int16_t *im = im_block;
  199|       |
  200|   890k|  if (w <= 4) {
  ------------------
  |  Branch (200:7): [True: 0, False: 890k]
  ------------------
  201|      0|    __m128i coeffs_128[3];
  202|       |
  203|      0|    prepare_half_coeffs_6tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
  204|      0|    if (w == 2) {
  ------------------
  |  Branch (204:9): [True: 0, False: 0]
  ------------------
  205|      0|      do {
  206|      0|        const __m128i r =
  207|      0|            x_convolve_6tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
  208|      0|        xy_x_round_store_2x2_sse2(r, im);
  209|      0|        src_ptr += 2 * src_stride;
  210|      0|        im += 2 * 2;
  211|      0|        y -= 2;
  212|      0|      } while (y);
  ------------------
  |  Branch (212:16): [True: 0, False: 0]
  ------------------
  213|      0|    } else if (w == 4) {
  ------------------
  |  Branch (213:16): [True: 0, False: 0]
  ------------------
  214|      0|      do {
  215|      0|        const __m128i r =
  216|      0|            x_convolve_6tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
  217|      0|        xy_x_round_store_4x2_sse2(r, im);
  218|      0|        src_ptr += 2 * src_stride;
  219|      0|        im += 2 * 4;
  220|      0|        y -= 2;
  221|      0|      } while (y);
  ------------------
  |  Branch (221:16): [True: 0, False: 0]
  ------------------
  222|      0|    }
  223|   890k|  } else {
  224|   890k|    __m256i coeffs_256[3], filt_256[3];
  225|       |
  226|   890k|    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
  227|   890k|    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
  228|   890k|    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
  229|       |
  230|   890k|    prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
  231|       |
  232|   890k|    if (w == 8) {
  ------------------
  |  Branch (232:9): [True: 528k, False: 361k]
  ------------------
  233|  3.35M|      do {
  234|  3.35M|        const __m256i res =
  235|  3.35M|            x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
  236|  3.35M|        xy_x_round_store_8x2_avx2(res, im);
  237|       |
  238|  3.35M|        src_ptr += 2 * src_stride;
  239|  3.35M|        im += 2 * 8;
  240|  3.35M|        y -= 2;
  241|  3.35M|      } while (y);
  ------------------
  |  Branch (241:16): [True: 2.82M, False: 528k]
  ------------------
  242|   528k|    } else if (w == 16) {
  ------------------
  |  Branch (242:16): [True: 282k, False: 79.2k]
  ------------------
  243|  2.23M|      do {
  244|  2.23M|        __m256i r[2];
  245|       |
  246|  2.23M|        x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
  247|  2.23M|        xy_x_round_store_32_avx2(r, im);
  248|  2.23M|        src_ptr += 2 * src_stride;
  249|  2.23M|        im += 2 * 16;
  250|  2.23M|        y -= 2;
  251|  2.23M|      } while (y);
  ------------------
  |  Branch (251:16): [True: 1.95M, False: 282k]
  ------------------
  252|   282k|    } else if (w == 32) {
  ------------------
  |  Branch (252:16): [True: 65.5k, False: 13.7k]
  ------------------
  253|  1.68M|      do {
  254|  1.68M|        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  255|  1.68M|        src_ptr += src_stride;
  256|  1.68M|        im += 32;
  257|  1.68M|      } while (--y);
  ------------------
  |  Branch (257:16): [True: 1.62M, False: 65.5k]
  ------------------
  258|  65.5k|    } else if (w == 64) {
  ------------------
  |  Branch (258:16): [True: 11.9k, False: 1.82k]
  ------------------
  259|   686k|      do {
  260|   686k|        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  261|   686k|        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  262|   686k|        src_ptr += src_stride;
  263|   686k|        im += 64;
  264|   686k|      } while (--y);
  ------------------
  |  Branch (264:16): [True: 674k, False: 11.9k]
  ------------------
  265|  11.9k|    } else {
  266|  1.82k|      assert(w == 128);
  267|       |
  268|   231k|      do {
  269|   231k|        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  270|   231k|        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  271|   231k|        xy_x_6tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
  272|   231k|        xy_x_6tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
  273|   231k|        src_ptr += src_stride;
  274|   231k|        im += 128;
  275|   231k|      } while (--y);
  ------------------
  |  Branch (275:16): [True: 229k, False: 1.91k]
  ------------------
  276|  1.91k|    }
  277|   890k|  }
  278|   890k|}
convolve_2d_avx2.c:convolve_2d_sr_hor_8tap_avx2:
  283|  53.9k|    const int32_t subpel_x_q4, int16_t *const im_block) {
  284|  53.9k|  const uint8_t *src_ptr = src - 3;
  285|  53.9k|  int32_t y = h;
  286|  53.9k|  int16_t *im = im_block;
  287|  53.9k|  __m256i coeffs_256[4], filt_256[4];
  288|       |
  289|  53.9k|  filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
  290|  53.9k|  filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
  291|  53.9k|  filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
  292|  53.9k|  filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
  293|       |
  294|  53.9k|  prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
  295|       |
  296|  53.9k|  if (w == 8) {
  ------------------
  |  Branch (296:7): [True: 21.1k, False: 32.8k]
  ------------------
  297|   147k|    do {
  298|   147k|      const __m256i res =
  299|   147k|          x_convolve_8tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
  300|   147k|      xy_x_round_store_8x2_avx2(res, im);
  301|   147k|      src_ptr += 2 * src_stride;
  302|   147k|      im += 2 * 8;
  303|   147k|      y -= 2;
  304|   147k|    } while (y);
  ------------------
  |  Branch (304:14): [True: 126k, False: 21.1k]
  ------------------
  305|  32.8k|  } else if (w == 16) {
  ------------------
  |  Branch (305:14): [True: 11.0k, False: 21.7k]
  ------------------
  306|   100k|    do {
  307|   100k|      __m256i r[2];
  308|       |
  309|   100k|      x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
  310|   100k|      xy_x_round_store_32_avx2(r, im);
  311|   100k|      src_ptr += 2 * src_stride;
  312|   100k|      im += 2 * 16;
  313|   100k|      y -= 2;
  314|   100k|    } while (y);
  ------------------
  |  Branch (314:14): [True: 89.2k, False: 11.0k]
  ------------------
  315|  21.7k|  } else if (w == 32) {
  ------------------
  |  Branch (315:14): [True: 14.5k, False: 7.14k]
  ------------------
  316|   379k|    do {
  317|   379k|      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  318|   379k|      src_ptr += src_stride;
  319|   379k|      im += 32;
  320|   379k|    } while (--y);
  ------------------
  |  Branch (320:14): [True: 364k, False: 14.5k]
  ------------------
  321|  14.5k|  } else if (w == 64) {
  ------------------
  |  Branch (321:14): [True: 6.87k, False: 264]
  ------------------
  322|   303k|    do {
  323|   303k|      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  324|   303k|      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  325|   303k|      src_ptr += src_stride;
  326|   303k|      im += 64;
  327|   303k|    } while (--y);
  ------------------
  |  Branch (327:14): [True: 296k, False: 6.87k]
  ------------------
  328|  6.87k|  } else {
  329|    264|    assert(w == 128);
  330|       |
  331|  30.9k|    do {
  332|  30.9k|      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
  333|  30.9k|      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
  334|  30.9k|      xy_x_8tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
  335|  30.9k|      xy_x_8tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
  336|  30.9k|      src_ptr += src_stride;
  337|  30.9k|      im += 128;
  338|  30.9k|    } while (--y);
  ------------------
  |  Branch (338:14): [True: 30.6k, False: 265]
  ------------------
  339|    265|  }
  340|  53.9k|}
convolve_2d_avx2.c:convolve_2d_sr_ver_2tap_half_avx2:
  485|  15.3k|    uint8_t *dst, const int32_t dst_stride) {
  486|  15.3k|  const int16_t *im = im_block;
  487|  15.3k|  int32_t y = h;
  488|       |
  489|  15.3k|  (void)filter_params_y;
  490|  15.3k|  (void)subpel_y_q4;
  491|       |
  492|  15.3k|  if (w == 2) {
  ------------------
  |  Branch (492:7): [True: 1.58k, False: 13.8k]
  ------------------
  493|  1.58k|    __m128i s_32[2];
  494|       |
  495|  1.58k|    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
  496|       |
  497|  3.03k|    do {
  498|  3.03k|      const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
  499|  3.03k|      const __m128i r = xy_y_round_half_pel_sse2(res);
  500|  3.03k|      pack_store_2x2_sse2(r, dst, dst_stride);
  501|  3.03k|      im += 2 * 2;
  502|  3.03k|      dst += 2 * dst_stride;
  503|  3.03k|      y -= 2;
  504|  3.03k|    } while (y);
  ------------------
  |  Branch (504:14): [True: 1.45k, False: 1.58k]
  ------------------
  505|  13.8k|  } else if (w == 4) {
  ------------------
  |  Branch (505:14): [True: 5.04k, False: 8.76k]
  ------------------
  506|  5.04k|    __m128i s_64[2];
  507|       |
  508|  5.04k|    s_64[0] = _mm_loadl_epi64((__m128i *)im);
  509|       |
  510|  15.1k|    do {
  511|  15.1k|      const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
  512|  15.1k|      const __m128i r = xy_y_round_half_pel_sse2(res);
  513|  15.1k|      pack_store_4x2_sse2(r, dst, dst_stride);
  514|  15.1k|      im += 2 * 4;
  515|  15.1k|      dst += 2 * dst_stride;
  516|  15.1k|      y -= 2;
  517|  15.1k|    } while (y);
  ------------------
  |  Branch (517:14): [True: 10.1k, False: 5.04k]
  ------------------
  518|  8.76k|  } else if (w == 8) {
  ------------------
  |  Branch (518:14): [True: 4.48k, False: 4.27k]
  ------------------
  519|  4.48k|    __m128i s_128[2];
  520|       |
  521|  4.48k|    s_128[0] = _mm_loadu_si128((__m128i *)im);
  522|       |
  523|  17.0k|    do {
  524|  17.0k|      const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
  525|  17.0k|      const __m256i r = xy_y_round_half_pel_avx2(res);
  526|  17.0k|      pack_store_8x2_avx2(r, dst, dst_stride);
  527|  17.0k|      im += 2 * 8;
  528|  17.0k|      dst += 2 * dst_stride;
  529|  17.0k|      y -= 2;
  530|  17.0k|    } while (y);
  ------------------
  |  Branch (530:14): [True: 12.6k, False: 4.48k]
  ------------------
  531|  4.48k|  } else if (w == 16) {
  ------------------
  |  Branch (531:14): [True: 2.49k, False: 1.77k]
  ------------------
  532|  2.49k|    __m256i s_256[2], r[2];
  533|       |
  534|  2.49k|    s_256[0] = _mm256_loadu_si256((__m256i *)im);
  535|       |
  536|  16.4k|    do {
  537|  16.4k|      xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
  538|  16.4k|      r[0] = xy_y_round_half_pel_avx2(r[0]);
  539|  16.4k|      r[1] = xy_y_round_half_pel_avx2(r[1]);
  540|  16.4k|      xy_y_pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
  541|  16.4k|      im += 2 * 16;
  542|  16.4k|      dst += 2 * dst_stride;
  543|  16.4k|      y -= 2;
  544|  16.4k|    } while (y);
  ------------------
  |  Branch (544:14): [True: 13.9k, False: 2.49k]
  ------------------
  545|  2.49k|  } else if (w == 32) {
  ------------------
  |  Branch (545:14): [True: 1.26k, False: 514]
  ------------------
  546|  1.26k|    __m256i s_256[2][2];
  547|       |
  548|  1.26k|    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  549|  1.26k|    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  550|       |
  551|  16.2k|    do {
  552|  16.2k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 32, s_256[0], s_256[1], dst);
  553|  16.2k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 2 * 32, s_256[1], s_256[0],
  554|  16.2k|                                              dst + dst_stride);
  555|  16.2k|      im += 2 * 32;
  556|  16.2k|      dst += 2 * dst_stride;
  557|  16.2k|      y -= 2;
  558|  16.2k|    } while (y);
  ------------------
  |  Branch (558:14): [True: 14.9k, False: 1.26k]
  ------------------
  559|  1.26k|  } else if (w == 64) {
  ------------------
  |  Branch (559:14): [True: 314, False: 200]
  ------------------
  560|    314|    __m256i s_256[2][4];
  561|       |
  562|    314|    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  563|    314|    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  564|    314|    s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
  565|    314|    s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
  566|       |
  567|  7.10k|    do {
  568|  7.10k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 64, s_256[0] + 0,
  569|  7.10k|                                              s_256[1] + 0, dst);
  570|  7.10k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 96, s_256[0] + 2,
  571|  7.10k|                                              s_256[1] + 2, dst + 32);
  572|  7.10k|      im += 2 * 64;
  573|  7.10k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
  574|  7.10k|                                              dst + dst_stride);
  575|  7.10k|      xy_y_convolve_2tap_half_pel_32_all_avx2(
  576|  7.10k|          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 32);
  577|  7.10k|      dst += 2 * dst_stride;
  578|  7.10k|      y -= 2;
  579|  7.10k|    } while (y);
  ------------------
  |  Branch (579:14): [True: 6.79k, False: 314]
  ------------------
  580|    314|  } else {
  581|    200|    __m256i s_256[2][8];
  582|       |
  583|    200|    assert(w == 128);
  584|       |
  585|    201|    load_16bit_8rows_avx2(im, 16, s_256[0]);
  586|       |
  587|  8.96k|    do {
  588|  8.96k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 128, s_256[0] + 0,
  589|  8.96k|                                              s_256[1] + 0, dst);
  590|  8.96k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 160, s_256[0] + 2,
  591|  8.96k|                                              s_256[1] + 2, dst + 1 * 32);
  592|  8.96k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 192, s_256[0] + 4,
  593|  8.96k|                                              s_256[1] + 4, dst + 2 * 32);
  594|  8.96k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 224, s_256[0] + 6,
  595|  8.96k|                                              s_256[1] + 6, dst + 3 * 32);
  596|  8.96k|      im += 2 * 128;
  597|  8.96k|      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
  598|  8.96k|                                              dst + dst_stride);
  599|  8.96k|      xy_y_convolve_2tap_half_pel_32_all_avx2(
  600|  8.96k|          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 1 * 32);
  601|  8.96k|      xy_y_convolve_2tap_half_pel_32_all_avx2(
  602|  8.96k|          im + 64, s_256[1] + 4, s_256[0] + 4, dst + dst_stride + 2 * 32);
  603|  8.96k|      xy_y_convolve_2tap_half_pel_32_all_avx2(
  604|  8.96k|          im + 96, s_256[1] + 6, s_256[0] + 6, dst + dst_stride + 3 * 32);
  605|  8.96k|      dst += 2 * dst_stride;
  606|  8.96k|      y -= 2;
  607|  8.96k|    } while (y);
  ------------------
  |  Branch (607:14): [True: 8.75k, False: 201]
  ------------------
  608|    201|  }
  609|  15.3k|}
convolve_2d_avx2.c:convolve_2d_sr_ver_2tap_avx2:
  345|  54.8k|    uint8_t *dst, const int32_t dst_stride) {
  346|  54.8k|  const int16_t *im = im_block;
  347|  54.8k|  int32_t y = h;
  348|       |
  349|  54.8k|  if (w <= 4) {
  ------------------
  |  Branch (349:7): [True: 28.4k, False: 26.4k]
  ------------------
  350|  28.4k|    __m128i coeffs_128;
  351|       |
  352|  28.4k|    prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);
  353|       |
  354|  28.4k|    if (w == 2) {
  ------------------
  |  Branch (354:9): [True: 6.57k, False: 21.8k]
  ------------------
  355|  6.57k|      __m128i s_32[2];
  356|       |
  357|  6.57k|      s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
  358|       |
  359|  13.4k|      do {
  360|  13.4k|        const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
  361|  13.4k|        xy_y_round_store_2x2_sse2(res, dst, dst_stride);
  362|  13.4k|        im += 2 * 2;
  363|  13.4k|        dst += 2 * dst_stride;
  364|  13.4k|        y -= 2;
  365|  13.4k|      } while (y);
  ------------------
  |  Branch (365:16): [True: 6.84k, False: 6.57k]
  ------------------
  366|  21.8k|    } else {
  367|  21.8k|      __m128i s_64[2], r[2];
  368|       |
  369|  21.8k|      assert(w == 4);
  370|       |
  371|  21.8k|      s_64[0] = _mm_loadl_epi64((__m128i *)im);
  372|       |
  373|  64.6k|      do {
  374|  64.6k|        xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
  375|  64.6k|        r[0] = xy_y_round_sse2(r[0]);
  376|  64.6k|        r[1] = xy_y_round_sse2(r[1]);
  377|  64.6k|        const __m128i rr = _mm_packs_epi32(r[0], r[1]);
  378|  64.6k|        pack_store_4x2_sse2(rr, dst, dst_stride);
  379|  64.6k|        im += 2 * 4;
  380|  64.6k|        dst += 2 * dst_stride;
  381|  64.6k|        y -= 2;
  382|  64.6k|      } while (y);
  ------------------
  |  Branch (382:16): [True: 42.8k, False: 21.8k]
  ------------------
  383|  21.8k|    }
  384|  28.4k|  } else {
  385|  26.4k|    __m256i coeffs_256;
  386|       |
  387|  26.4k|    prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);
  388|       |
  389|  26.4k|    if (w == 8) {
  ------------------
  |  Branch (389:9): [True: 16.2k, False: 10.1k]
  ------------------
  390|  16.2k|      __m128i s_128[2];
  391|  16.2k|      __m256i r[2];
  392|       |
  393|  16.2k|      s_128[0] = _mm_loadu_si128((__m128i *)im);
  394|       |
  395|  52.7k|      do {
  396|  52.7k|        xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
  397|  52.7k|        xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  398|  52.7k|        im += 2 * 8;
  399|  52.7k|        dst += 2 * dst_stride;
  400|  52.7k|        y -= 2;
  401|  52.7k|      } while (y);
  ------------------
  |  Branch (401:16): [True: 36.5k, False: 16.2k]
  ------------------
  402|  16.2k|    } else if (w == 16) {
  ------------------
  |  Branch (402:16): [True: 6.52k, False: 3.67k]
  ------------------
  403|  6.52k|      __m256i s_256[2], r[4];
  404|       |
  405|  6.52k|      s_256[0] = _mm256_loadu_si256((__m256i *)im);
  406|       |
  407|  31.2k|      do {
  408|  31.2k|        xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
  409|  31.2k|        xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  410|  31.2k|        im += 2 * 16;
  411|  31.2k|        dst += 2 * dst_stride;
  412|  31.2k|        y -= 2;
  413|  31.2k|      } while (y);
  ------------------
  |  Branch (413:16): [True: 24.7k, False: 6.52k]
  ------------------
  414|  6.52k|    } else if (w == 32) {
  ------------------
  |  Branch (414:16): [True: 2.04k, False: 1.62k]
  ------------------
  415|  2.04k|      __m256i s_256[2][2];
  416|       |
  417|  2.04k|      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  418|  2.04k|      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  419|       |
  420|  22.8k|      do {
  421|  22.8k|        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[0], s_256[1], &coeffs_256,
  422|  22.8k|                                       dst);
  423|  22.8k|        im += 2 * 32;
  424|  22.8k|        xy_y_convolve_2tap_32_all_avx2(im, s_256[1], s_256[0], &coeffs_256,
  425|  22.8k|                                       dst + dst_stride);
  426|  22.8k|        dst += 2 * dst_stride;
  427|  22.8k|        y -= 2;
  428|  22.8k|      } while (y);
  ------------------
  |  Branch (428:16): [True: 20.7k, False: 2.04k]
  ------------------
  429|  2.04k|    } else if (w == 64) {
  ------------------
  |  Branch (429:16): [True: 1.33k, False: 293]
  ------------------
  430|  1.33k|      __m256i s_256[2][4];
  431|       |
  432|  1.33k|      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  433|  1.33k|      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  434|  1.33k|      s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
  435|  1.33k|      s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
  436|       |
  437|  30.1k|      do {
  438|  30.1k|        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[0] + 0, s_256[1] + 0,
  439|  30.1k|                                       &coeffs_256, dst);
  440|  30.1k|        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[0] + 2, s_256[1] + 2,
  441|  30.1k|                                       &coeffs_256, dst + 32);
  442|  30.1k|        im += 2 * 64;
  443|  30.1k|        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
  444|  30.1k|                                       &coeffs_256, dst + dst_stride);
  445|  30.1k|        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
  446|  30.1k|                                       &coeffs_256, dst + dst_stride + 32);
  447|  30.1k|        dst += 2 * dst_stride;
  448|  30.1k|        y -= 2;
  449|  30.1k|      } while (y);
  ------------------
  |  Branch (449:16): [True: 28.7k, False: 1.33k]
  ------------------
  450|  1.33k|    } else {
  451|    293|      __m256i s_256[2][8];
  452|       |
  453|    293|      assert(w == 128);
  454|       |
  455|    293|      load_16bit_8rows_avx2(im, 16, s_256[0]);
  456|       |
  457|  14.3k|      do {
  458|  14.3k|        xy_y_convolve_2tap_32_all_avx2(im + 128, s_256[0] + 0, s_256[1] + 0,
  459|  14.3k|                                       &coeffs_256, dst);
  460|  14.3k|        xy_y_convolve_2tap_32_all_avx2(im + 160, s_256[0] + 2, s_256[1] + 2,
  461|  14.3k|                                       &coeffs_256, dst + 1 * 32);
  462|  14.3k|        xy_y_convolve_2tap_32_all_avx2(im + 192, s_256[0] + 4, s_256[1] + 4,
  463|  14.3k|                                       &coeffs_256, dst + 2 * 32);
  464|  14.3k|        xy_y_convolve_2tap_32_all_avx2(im + 224, s_256[0] + 6, s_256[1] + 6,
  465|  14.3k|                                       &coeffs_256, dst + 3 * 32);
  466|  14.3k|        im += 2 * 128;
  467|  14.3k|        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
  468|  14.3k|                                       &coeffs_256, dst + dst_stride);
  469|  14.3k|        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
  470|  14.3k|                                       &coeffs_256, dst + dst_stride + 1 * 32);
  471|  14.3k|        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[1] + 4, s_256[0] + 4,
  472|  14.3k|                                       &coeffs_256, dst + dst_stride + 2 * 32);
  473|  14.3k|        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[1] + 6, s_256[0] + 6,
  474|  14.3k|                                       &coeffs_256, dst + dst_stride + 3 * 32);
  475|  14.3k|        dst += 2 * dst_stride;
  476|  14.3k|        y -= 2;
  477|  14.3k|      } while (y);
  ------------------
  |  Branch (477:16): [True: 14.0k, False: 293]
  ------------------
  478|    293|    }
  479|  26.4k|  }
  480|  54.8k|}
convolve_2d_avx2.c:convolve_2d_sr_ver_4tap_avx2:
  614|   879k|    uint8_t *dst, const int32_t dst_stride) {
  615|   879k|  const int16_t *im = im_block;
  616|   879k|  int32_t y = h;
  617|       |
  618|   879k|  if (w == 2) {
  ------------------
  |  Branch (618:7): [True: 86.6k, False: 792k]
  ------------------
  619|  86.6k|    __m128i coeffs_128[2], s_32[4], ss_128[2];
  620|       |
  621|  86.6k|    prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
  622|       |
  623|  86.6k|    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
  624|  86.6k|    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
  625|  86.6k|    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
  626|       |
  627|  86.6k|    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
  628|  86.6k|    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
  629|       |
  630|  86.6k|    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
  631|       |
  632|   146k|    do {
  633|   146k|      const __m128i res =
  634|   146k|          xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
  635|   146k|      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
  636|   146k|      im += 2 * 2;
  637|   146k|      dst += 2 * dst_stride;
  638|   146k|      y -= 2;
  639|   146k|    } while (y);
  ------------------
  |  Branch (639:14): [True: 60.1k, False: 86.6k]
  ------------------
  640|   792k|  } else {
  641|   792k|    __m256i coeffs_256[2];
  642|       |
  643|   792k|    prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
  644|       |
  645|   792k|    if (w == 4) {
  ------------------
  |  Branch (645:9): [True: 363k, False: 429k]
  ------------------
  646|   363k|      __m128i s_64[4];
  647|   363k|      __m256i s_256[2], ss_256[2];
  648|       |
  649|   363k|      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
  650|   363k|      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
  651|   363k|      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
  652|       |
  653|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
  654|   363k|      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|   363k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   363k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  655|   363k|      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|   363k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   363k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  656|       |
  657|   363k|      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  658|       |
  659|   726k|      do {
  660|   726k|        const __m256i res =
  661|   726k|            xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
  662|   726k|        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
  663|   726k|        im += 2 * 4;
  664|   726k|        dst += 2 * dst_stride;
  665|   726k|        y -= 2;
  666|   726k|      } while (y);
  ------------------
  |  Branch (666:16): [True: 362k, False: 363k]
  ------------------
  667|   429k|    } else if (w == 8) {
  ------------------
  |  Branch (667:16): [True: 291k, False: 137k]
  ------------------
  668|   291k|      __m256i s_256[4], r[2];
  669|       |
  670|   291k|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
  671|   291k|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
  672|       |
  673|   291k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (673:11): [True: 241k, False: 50.0k]
  ------------------
  674|   241k|        __m256i ss_256[4];
  675|       |
  676|   241k|        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  677|   241k|        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  678|       |
  679|   482k|        do {
  680|   482k|          xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
  681|   482k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  682|   482k|          im += 2 * 8;
  683|   482k|          dst += 2 * dst_stride;
  684|   482k|          y -= 2;
  685|   482k|        } while (y);
  ------------------
  |  Branch (685:18): [True: 240k, False: 241k]
  ------------------
  686|   241k|      } else {
  687|  88.0k|        do {
  688|  88.0k|          xy_y_convolve_4tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
  689|  88.0k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  690|  88.0k|          im += 2 * 8;
  691|  88.0k|          dst += 2 * dst_stride;
  692|  88.0k|          y -= 2;
  693|  88.0k|        } while (y);
  ------------------
  |  Branch (693:18): [True: 37.9k, False: 50.0k]
  ------------------
  694|  50.0k|      }
  695|   291k|    } else if (w == 16) {
  ------------------
  |  Branch (695:16): [True: 126k, False: 10.6k]
  ------------------
  696|   126k|      __m256i s_256[5];
  697|       |
  698|   126k|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  699|   126k|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  700|   126k|      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
  701|       |
  702|   126k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (702:11): [True: 102k, False: 24.5k]
  ------------------
  703|   102k|        __m256i ss_256[4], tt_256[4], r[4];
  704|       |
  705|   102k|        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  706|   102k|        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  707|       |
  708|   102k|        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  709|   102k|        tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  710|       |
  711|   266k|        do {
  712|   266k|          xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256,
  713|   266k|                                       r);
  714|   266k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  715|   266k|          im += 2 * 16;
  716|   266k|          dst += 2 * dst_stride;
  717|   266k|          y -= 2;
  718|   266k|        } while (y);
  ------------------
  |  Branch (718:18): [True: 164k, False: 102k]
  ------------------
  719|   102k|      } else {
  720|  24.5k|        __m256i r[4];
  721|       |
  722|  49.0k|        do {
  723|  49.0k|          xy_y_convolve_4tap_16x2_half_pelavx2(im, s_256, coeffs_256, r);
  724|  49.0k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  725|  49.0k|          im += 2 * 16;
  726|  49.0k|          dst += 2 * dst_stride;
  727|  49.0k|          y -= 2;
  728|  49.0k|        } while (y);
  ------------------
  |  Branch (728:18): [True: 24.5k, False: 24.5k]
  ------------------
  729|  24.5k|      }
  730|   126k|    } else {
  731|       |      /*It's a special condition for OBMC. A/c  to Av1 spec 4-tap won't
  732|       |      support for width(w)>16, but for OBMC while predicting above block
  733|       |      it reduces size block to Wx(h/2), for example, if above block size
  734|       |      is 32x8, we get block size as 32x4 for OBMC.*/
  735|  10.6k|      int32_t x = 0;
  736|       |
  737|  10.6k|      assert(!(w % 32));
  738|       |
  739|  10.7k|      __m256i s_256[2][4], ss_256[2][4], tt_256[2][4], r0[4], r1[4];
  740|  13.5k|      do {
  741|  13.5k|        const int16_t *s = im + x;
  742|  13.5k|        uint8_t *d = dst + x;
  743|       |
  744|  13.5k|        loadu_unpack_16bit_3rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
  745|  13.5k|        loadu_unpack_16bit_3rows_avx2(s + 16, w, s_256[1], ss_256[1],
  746|  13.5k|                                      tt_256[1]);
  747|       |
  748|  13.5k|        y = h;
  749|   222k|        do {
  750|   222k|          xy_y_convolve_4tap_32x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
  751|   222k|                                       coeffs_256, r0);
  752|   222k|          xy_y_convolve_4tap_32x2_avx2(s + 16, w, s_256[1], ss_256[1],
  753|   222k|                                       tt_256[1], coeffs_256, r1);
  754|       |
  755|   222k|          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
  756|   222k|          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
  757|       |
  758|   222k|          s += 2 * w;
  759|   222k|          d += 2 * dst_stride;
  760|   222k|          y -= 2;
  761|   222k|        } while (y);
  ------------------
  |  Branch (761:18): [True: 209k, False: 13.5k]
  ------------------
  762|       |
  763|  13.5k|        x += 32;
  764|  13.5k|      } while (x < w);
  ------------------
  |  Branch (764:16): [True: 2.86k, False: 10.7k]
  ------------------
  765|  10.7k|    }
  766|   792k|  }
  767|   879k|}
convolve_2d_avx2.c:convolve_2d_sr_ver_6tap_avx2:
  772|   818k|    uint8_t *dst, const int32_t dst_stride) {
  773|   818k|  const int16_t *im = im_block;
  774|   818k|  int32_t y;
  775|       |
  776|   818k|  if (w == 2) {
  ------------------
  |  Branch (776:7): [True: 59.7k, False: 758k]
  ------------------
  777|  59.7k|    __m128i coeffs_128[3], s_32[6], ss_128[3];
  778|       |
  779|  59.7k|    prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
  780|       |
  781|  59.7k|    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
  782|  59.7k|    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
  783|  59.7k|    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
  784|  59.7k|    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
  785|  59.7k|    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
  786|       |
  787|  59.7k|    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
  788|  59.7k|    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
  789|  59.7k|    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
  790|  59.7k|    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
  791|       |
  792|  59.7k|    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
  793|  59.7k|    ss_128[1] = _mm_unpacklo_epi16(src23, src34);
  794|       |
  795|  59.7k|    y = h;
  796|   239k|    do {
  797|   239k|      const __m128i res =
  798|   239k|          xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
  799|   239k|      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
  800|   239k|      im += 2 * 2;
  801|   239k|      dst += 2 * dst_stride;
  802|   239k|      y -= 2;
  803|   239k|    } while (y);
  ------------------
  |  Branch (803:14): [True: 179k, False: 59.7k]
  ------------------
  804|   758k|  } else {
  805|   758k|    __m256i coeffs_256[3];
  806|       |
  807|   758k|    prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
  808|       |
  809|   758k|    if (w == 4) {
  ------------------
  |  Branch (809:9): [True: 224k, False: 534k]
  ------------------
  810|   224k|      __m128i s_64[6];
  811|   224k|      __m256i s_256[6], ss_256[3];
  812|       |
  813|   224k|      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
  814|   224k|      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
  815|   224k|      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
  816|   224k|      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
  817|   224k|      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
  818|       |
  819|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
  820|   224k|      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|   224k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   224k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  821|   224k|      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|   224k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   224k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  822|   224k|      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|   224k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   224k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  823|   224k|      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
  ------------------
  |  |   29|   224k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   224k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
  824|       |
  825|   224k|      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  826|   224k|      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  827|       |
  828|   224k|      y = h;
  829|  1.20M|      do {
  830|  1.20M|        const __m256i res =
  831|  1.20M|            xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
  832|  1.20M|        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
  833|  1.20M|        im += 2 * 4;
  834|  1.20M|        dst += 2 * dst_stride;
  835|  1.20M|        y -= 2;
  836|  1.20M|      } while (y);
  ------------------
  |  Branch (836:16): [True: 977k, False: 224k]
  ------------------
  837|   534k|    } else if (w == 8) {
  ------------------
  |  Branch (837:16): [True: 280k, False: 253k]
  ------------------
  838|   280k|      __m256i s_256[6], r[2];
  839|       |
  840|   280k|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
  841|   280k|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
  842|   280k|      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
  843|   280k|      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
  844|   280k|      y = h;
  845|       |
  846|   280k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (846:11): [True: 219k, False: 60.8k]
  ------------------
  847|   219k|        __m256i ss_256[6];
  848|       |
  849|   219k|        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  850|   219k|        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  851|       |
  852|   219k|        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  853|   219k|        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  854|       |
  855|  1.23M|        do {
  856|  1.23M|          xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
  857|  1.23M|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  858|  1.23M|          im += 2 * 8;
  859|  1.23M|          dst += 2 * dst_stride;
  860|  1.23M|          y -= 2;
  861|  1.23M|        } while (y);
  ------------------
  |  Branch (861:18): [True: 1.01M, False: 219k]
  ------------------
  862|   219k|      } else {
  863|   358k|        do {
  864|   358k|          xy_y_convolve_6tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
  865|   358k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
  866|   358k|          im += 2 * 8;
  867|   358k|          dst += 2 * dst_stride;
  868|   358k|          y -= 2;
  869|   358k|        } while (y);
  ------------------
  |  Branch (869:18): [True: 297k, False: 60.8k]
  ------------------
  870|  60.8k|      }
  871|   280k|    } else if (w == 16) {
  ------------------
  |  Branch (871:16): [True: 178k, False: 75.8k]
  ------------------
  872|   178k|      __m256i s_256[6];
  873|       |
  874|   178k|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
  875|   178k|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
  876|   178k|      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
  877|   178k|      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
  878|   178k|      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
  879|   178k|      y = h;
  880|       |
  881|   178k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (881:11): [True: 135k, False: 43.0k]
  ------------------
  882|   135k|        __m256i ss_256[6], tt_256[6], r[4];
  883|       |
  884|   135k|        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  885|   135k|        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  886|   135k|        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  887|   135k|        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  888|       |
  889|   135k|        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  890|   135k|        tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
  891|   135k|        tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  892|   135k|        tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
  893|       |
  894|   971k|        do {
  895|   971k|          xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256,
  896|   971k|                                       coeffs_256, r);
  897|   971k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  898|   971k|          im += 2 * 16;
  899|   971k|          dst += 2 * dst_stride;
  900|   971k|          y -= 2;
  901|   971k|        } while (y);
  ------------------
  |  Branch (901:18): [True: 836k, False: 135k]
  ------------------
  902|   135k|      } else {
  903|  43.0k|        __m256i ss_256[4], r[4];
  904|       |
  905|   321k|        do {
  906|   321k|          xy_y_convolve_6tap_16x2_half_pel_avx2(im, 16, s_256, ss_256,
  907|   321k|                                                coeffs_256, r);
  908|   321k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
  909|       |
  910|   321k|          im += 2 * 16;
  911|   321k|          dst += 2 * dst_stride;
  912|   321k|          y -= 2;
  913|   321k|        } while (y);
  ------------------
  |  Branch (913:18): [True: 278k, False: 43.0k]
  ------------------
  914|  43.0k|      }
  915|   178k|    } else {
  916|  75.8k|      int32_t x = 0;
  917|       |
  918|  75.8k|      assert(!(w % 32));
  919|       |
  920|  75.9k|      __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];
  921|       |
  922|  93.6k|      do {
  923|  93.6k|        const int16_t *s = im + x;
  924|  93.6k|        uint8_t *d = dst + x;
  925|       |
  926|  93.6k|        loadu_unpack_16bit_5rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
  927|  93.6k|        loadu_unpack_16bit_5rows_avx2(s + 16, w, s_256[1], ss_256[1],
  928|  93.6k|                                      tt_256[1]);
  929|       |
  930|  93.6k|        y = h;
  931|  1.71M|        do {
  932|  1.71M|          xy_y_convolve_6tap_16x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
  933|  1.71M|                                       coeffs_256, r0);
  934|  1.71M|          xy_y_convolve_6tap_16x2_avx2(s + 16, w, s_256[1], ss_256[1],
  935|  1.71M|                                       tt_256[1], coeffs_256, r1);
  936|       |
  937|  1.71M|          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
  938|  1.71M|          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
  939|       |
  940|  1.71M|          s += 2 * w;
  941|  1.71M|          d += 2 * dst_stride;
  942|  1.71M|          y -= 2;
  943|  1.71M|        } while (y);
  ------------------
  |  Branch (943:18): [True: 1.61M, False: 93.6k]
  ------------------
  944|       |
  945|  93.6k|        x += 32;
  946|  93.6k|      } while (x < w);
  ------------------
  |  Branch (946:16): [True: 17.6k, False: 75.9k]
  ------------------
  947|  75.9k|    }
  948|   758k|  }
  949|   818k|}
convolve_2d_avx2.c:convolve_2d_sr_ver_8tap_avx2:
  954|  52.2k|    uint8_t *dst, const int32_t dst_stride) {
  955|  52.2k|  const int16_t *im = im_block;
  956|  52.2k|  int32_t y;
  957|       |
  958|  52.2k|  if (w == 2) {
  ------------------
  |  Branch (958:7): [True: 2.74k, False: 49.4k]
  ------------------
  959|  2.74k|    __m128i coeffs_128[4], s_32[8], ss_128[4];
  960|       |
  961|  2.74k|    prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
  962|       |
  963|  2.74k|    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
  964|  2.74k|    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
  965|  2.74k|    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
  966|  2.74k|    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
  967|  2.74k|    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
  968|  2.74k|    s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
  969|  2.74k|    s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
  970|       |
  971|  2.74k|    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
  972|  2.74k|    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
  973|  2.74k|    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
  974|  2.74k|    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
  975|  2.74k|    const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
  976|  2.74k|    const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
  977|       |
  978|  2.74k|    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
  979|  2.74k|    ss_128[1] = _mm_unpacklo_epi16(src23, src34);
  980|  2.74k|    ss_128[2] = _mm_unpacklo_epi16(src45, src56);
  981|       |
  982|  2.74k|    y = h;
  983|  10.9k|    do {
  984|  10.9k|      const __m128i res =
  985|  10.9k|          xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
  986|  10.9k|      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
  987|  10.9k|      im += 2 * 2;
  988|  10.9k|      dst += 2 * dst_stride;
  989|  10.9k|      y -= 2;
  990|  10.9k|    } while (y);
  ------------------
  |  Branch (990:14): [True: 8.24k, False: 2.74k]
  ------------------
  991|  49.4k|  } else {
  992|  49.4k|    __m256i coeffs_256[4];
  993|       |
  994|  49.4k|    prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
  995|       |
  996|  49.4k|    if (w == 4) {
  ------------------
  |  Branch (996:9): [True: 9.97k, False: 39.4k]
  ------------------
  997|  9.97k|      __m128i s_64[8];
  998|  9.97k|      __m256i s_256[8], ss_256[4];
  999|       |
 1000|  9.97k|      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
 1001|  9.97k|      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
 1002|  9.97k|      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
 1003|  9.97k|      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
 1004|  9.97k|      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
 1005|  9.97k|      s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
 1006|  9.97k|      s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
 1007|       |
 1008|       |      // Load lines a and b. Line a to lower 128, line b to upper 128
 1009|  9.97k|      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|  9.97k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  9.97k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1010|  9.97k|      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|  9.97k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  9.97k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1011|  9.97k|      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|  9.97k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  9.97k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1012|  9.97k|      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
  ------------------
  |  |   29|  9.97k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  9.97k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1013|  9.97k|      s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
  ------------------
  |  |   29|  9.97k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  9.97k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1014|  9.97k|      s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
  ------------------
  |  |   29|  9.97k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  9.97k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1015|       |
 1016|  9.97k|      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1017|  9.97k|      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1018|  9.97k|      ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
 1019|       |
 1020|  9.97k|      y = h;
 1021|  52.6k|      do {
 1022|  52.6k|        const __m256i res =
 1023|  52.6k|            xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
 1024|  52.6k|        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
 1025|  52.6k|        im += 2 * 4;
 1026|  52.6k|        dst += 2 * dst_stride;
 1027|  52.6k|        y -= 2;
 1028|  52.6k|      } while (y);
  ------------------
  |  Branch (1028:16): [True: 42.6k, False: 9.97k]
  ------------------
 1029|  39.4k|    } else if (w == 8) {
  ------------------
  |  Branch (1029:16): [True: 11.0k, False: 28.4k]
  ------------------
 1030|  11.0k|      __m256i s_256[8], r[2];
 1031|       |
 1032|  11.0k|      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
 1033|  11.0k|      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
 1034|  11.0k|      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
 1035|  11.0k|      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
 1036|  11.0k|      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
 1037|  11.0k|      s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
 1038|  11.0k|      y = h;
 1039|       |
 1040|  11.0k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (1040:11): [True: 7.60k, False: 3.39k]
  ------------------
 1041|  7.60k|        __m256i ss_256[8];
 1042|       |
 1043|  7.60k|        convolve_8tap_unpack_avx2(s_256, ss_256);
 1044|       |
 1045|  43.1k|        do {
 1046|  43.1k|          xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
 1047|  43.1k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
 1048|  43.1k|          im += 2 * 8;
 1049|  43.1k|          dst += 2 * dst_stride;
 1050|  43.1k|          y -= 2;
 1051|  43.1k|        } while (y);
  ------------------
  |  Branch (1051:18): [True: 35.5k, False: 7.60k]
  ------------------
 1052|  7.60k|      } else {
 1053|  22.0k|        do {
 1054|  22.0k|          xy_y_convolve_8tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
 1055|  22.0k|          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
 1056|  22.0k|          im += 2 * 8;
 1057|  22.0k|          dst += 2 * dst_stride;
 1058|  22.0k|          y -= 2;
 1059|  22.0k|        } while (y);
  ------------------
  |  Branch (1059:18): [True: 18.6k, False: 3.39k]
  ------------------
 1060|  3.39k|      }
 1061|  28.4k|    } else if (w == 16) {
  ------------------
  |  Branch (1061:16): [True: 6.92k, False: 21.5k]
  ------------------
 1062|  6.92k|      __m256i s_256[8], r[4];
 1063|       |
 1064|  6.92k|      load_16bit_7rows_avx2(im, 16, s_256);
 1065|  6.92k|      y = h;
 1066|       |
 1067|  6.92k|      if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (1067:11): [True: 4.67k, False: 2.24k]
  ------------------
 1068|  4.67k|        __m256i ss_256[8], tt_256[8];
 1069|       |
 1070|  4.67k|        convolve_8tap_unpack_avx2(s_256, ss_256);
 1071|  4.67k|        convolve_8tap_unpack_avx2(s_256 + 1, tt_256);
 1072|       |
 1073|  37.9k|        do {
 1074|  37.9k|          xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256,
 1075|  37.9k|                                       tt_256, r);
 1076|  37.9k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
 1077|       |
 1078|  37.9k|          im += 2 * 16;
 1079|  37.9k|          dst += 2 * dst_stride;
 1080|  37.9k|          y -= 2;
 1081|  37.9k|        } while (y);
  ------------------
  |  Branch (1081:18): [True: 33.3k, False: 4.67k]
  ------------------
 1082|  4.67k|      } else {
 1083|  17.4k|        do {
 1084|  17.4k|          xy_y_convolve_8tap_16x2_half_pel_avx2(im, 16, coeffs_256, s_256, r);
 1085|  17.4k|          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
 1086|       |
 1087|  17.4k|          im += 2 * 16;
 1088|  17.4k|          dst += 2 * dst_stride;
 1089|  17.4k|          y -= 2;
 1090|  17.4k|        } while (y);
  ------------------
  |  Branch (1090:18): [True: 15.2k, False: 2.24k]
  ------------------
 1091|  2.24k|      }
 1092|  21.5k|    } else {
 1093|  21.5k|      int32_t x = 0;
 1094|  21.5k|      __m256i s_256[2][8], r0[4], r1[4];
 1095|       |
 1096|  21.5k|      assert(!(w % 32));
 1097|       |
 1098|  21.5k|      __m256i ss_256[2][8], tt_256[2][8];
 1099|       |
 1100|  29.2k|      do {
 1101|  29.2k|        const int16_t *s = im + x;
 1102|  29.2k|        uint8_t *d = dst + x;
 1103|       |
 1104|  29.2k|        load_16bit_7rows_avx2(s, w, s_256[0]);
 1105|  29.2k|        convolve_8tap_unpack_avx2(s_256[0], ss_256[0]);
 1106|  29.2k|        convolve_8tap_unpack_avx2(s_256[0] + 1, tt_256[0]);
 1107|       |
 1108|  29.2k|        load_16bit_7rows_avx2(s + 16, w, s_256[1]);
 1109|  29.2k|        convolve_8tap_unpack_avx2(s_256[1], ss_256[1]);
 1110|  29.2k|        convolve_8tap_unpack_avx2(s_256[1] + 1, tt_256[1]);
 1111|       |
 1112|  29.2k|        y = h;
 1113|   436k|        do {
 1114|   436k|          xy_y_convolve_8tap_16x2_avx2(s, w, coeffs_256, s_256[0], ss_256[0],
 1115|   436k|                                       tt_256[0], r0);
 1116|   436k|          xy_y_convolve_8tap_16x2_avx2(s + 16, w, coeffs_256, s_256[1],
 1117|   436k|                                       ss_256[1], tt_256[1], r1);
 1118|   436k|          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
 1119|   436k|          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
 1120|       |
 1121|   436k|          s += 2 * w;
 1122|   436k|          d += 2 * dst_stride;
 1123|   436k|          y -= 2;
 1124|   436k|        } while (y);
  ------------------
  |  Branch (1124:18): [True: 407k, False: 29.2k]
  ------------------
 1125|       |
 1126|  29.2k|        x += 32;
 1127|  29.2k|      } while (x < w);
  ------------------
  |  Branch (1127:16): [True: 7.67k, False: 21.5k]
  ------------------
 1128|  21.5k|    }
 1129|  49.4k|  }
 1130|  52.2k|}

convolve_2d_avx2.c:prepare_half_coeffs_2tap_ssse3:
   61|  55.7k|    __m128i *const coeffs /* [1] */) {
   62|  55.7k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   63|  55.7k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  55.7k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  55.7k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   64|  55.7k|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
   65|       |
   66|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
   67|       |  // This extra right shift will be taken care of at the end while rounding
   68|       |  // the result.
   69|       |  // Since all filter co-efficients are even, this change will not affect the
   70|       |  // end result
   71|  55.7k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   72|  55.7k|                            _mm_set1_epi16((short)0xffff)));
   73|       |
   74|  55.7k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   75|       |
   76|       |  // coeffs 3 4 3 4 3 4 3 4
   77|  55.7k|  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
   78|  55.7k|}
convolve_2d_avx2.c:x_convolve_2tap_2x2_sse4_1:
  859|  24.6k|                                                 const __m128i coeffs[1]) {
  860|  24.6k|  const __m128i sfl =
  861|  24.6k|      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  862|  24.6k|  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
  863|  24.6k|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  864|  24.6k|  return convolve_2tap_ssse3(&ss, coeffs);
  865|  24.6k|}
convolve_2d_avx2.c:convolve_2tap_ssse3:
  433|   312k|                                          const __m128i coeffs[1]) {
  434|   312k|  return _mm_maddubs_epi16(ss[0], coeffs[0]);
  435|   312k|}
convolve_2d_avx2.c:xy_x_round_store_2x2_sse2:
  615|   784k|                                             int16_t *const dst) {
  616|   784k|  const __m128i d = xy_x_round_sse2(res);
  617|   784k|  _mm_storel_epi64((__m128i *)dst, d);
  618|   784k|}
convolve_2d_avx2.c:xy_x_round_sse2:
  602|  4.49M|static inline __m128i xy_x_round_sse2(const __m128i src) {
  603|  4.49M|  const __m128i round = _mm_set1_epi16(2);
  604|  4.49M|  const __m128i dst = _mm_add_epi16(src, round);
  605|  4.49M|  return _mm_srai_epi16(dst, 2);
  606|  4.49M|}
convolve_2d_avx2.c:x_convolve_2tap_4x2_ssse3:
  869|   106k|                                                const __m128i coeffs[1]) {
  870|   106k|  const __m128i sfl =
  871|   106k|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  872|   106k|  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
  873|   106k|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  874|   106k|  return convolve_2tap_ssse3(&ss, coeffs);
  875|   106k|}
convolve_2d_avx2.c:xy_x_round_store_4x2_sse2:
  621|  3.52M|                                             int16_t *const dst) {
  622|  3.52M|  const __m128i d = xy_x_round_sse2(res);
  623|  3.52M|  _mm_storeu_si128((__m128i *)dst, d);
  624|  3.52M|}
convolve_2d_avx2.c:x_convolve_2tap_8x2_ssse3:
  880|  90.6k|                                             __m128i r[2]) {
  881|  90.6k|  __m128i ss[2];
  882|  90.6k|  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
  883|  90.6k|  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
  884|  90.6k|  const __m128i s01 = _mm_srli_si128(s00, 1);
  885|  90.6k|  const __m128i s11 = _mm_srli_si128(s10, 1);
  886|  90.6k|  ss[0] = _mm_unpacklo_epi8(s00, s01);
  887|  90.6k|  ss[1] = _mm_unpacklo_epi8(s10, s11);
  888|       |
  889|  90.6k|  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
  890|  90.6k|  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
  891|  90.6k|}
convolve_2d_avx2.c:xy_x_round_store_8x2_sse2:
  627|  90.6k|                                             int16_t *const dst) {
  628|  90.6k|  __m128i r[2];
  629|       |
  630|  90.6k|  r[0] = xy_x_round_sse2(res[0]);
  631|  90.6k|  r[1] = xy_x_round_sse2(res[1]);
  632|  90.6k|  _mm_storeu_si128((__m128i *)dst, r[0]);
  633|  90.6k|  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
  634|  90.6k|}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_avx2:
  157|  14.4k|    __m256i *const coeffs /* [1] */) {
  158|  14.4k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  159|  14.4k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  14.4k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  14.4k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  160|  14.4k|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  161|  14.4k|  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
  162|       |
  163|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  164|       |  // This extra right shift will be taken care of at the end while rounding
  165|       |  // the result.
  166|       |  // Since all filter co-efficients are even, this change will not affect the
  167|       |  // end result
  168|  14.4k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  169|  14.4k|                            _mm_set1_epi16((short)0xffff)));
  170|       |
  171|  14.4k|  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
  172|       |
  173|       |  // coeffs 3 4 3 4 3 4 3 4
  174|  14.4k|  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  175|  14.4k|}
convolve_2d_avx2.c:x_convolve_2tap_16x2_avx2:
  912|  56.7k|                                             __m256i r[2]) {
  913|  56.7k|  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
  914|  56.7k|  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
  915|  56.7k|  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
  916|  56.7k|  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
  917|  56.7k|  r[0] = convolve_2tap_avx2(&s0, coeffs);
  918|  56.7k|  r[1] = convolve_2tap_avx2(&s1, coeffs);
  919|  56.7k|}
convolve_2d_avx2.c:convolve_2tap_avx2:
  465|   973k|                                         const __m256i coeffs[1]) {
  466|   973k|  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
  467|   973k|}
convolve_2d_avx2.c:xy_x_round_store_32_avx2:
  643|  2.53M|                                            int16_t *const dst) {
  644|  2.53M|  __m256i r[2];
  645|       |
  646|  2.53M|  r[0] = xy_x_round_avx2(res[0]);
  647|  2.53M|  r[1] = xy_x_round_avx2(res[1]);
  648|  2.53M|  const __m256i d0 =
  649|  2.53M|      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
  650|  2.53M|  const __m256i d1 =
  651|  2.53M|      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
  652|  2.53M|  _mm256_storeu_si256((__m256i *)dst, d0);
  653|  2.53M|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
  654|  2.53M|}
convolve_2d_avx2.c:xy_x_round_avx2:
  608|  20.7M|static inline __m256i xy_x_round_avx2(const __m256i src) {
  609|  20.7M|  const __m256i round = _mm256_set1_epi16(2);
  610|  20.7M|  const __m256i dst = _mm256_add_epi16(src, round);
  611|  20.7M|  return _mm256_srai_epi16(dst, 2);
  612|  20.7M|}
convolve_2d_avx2.c:xy_x_2tap_32_avx2:
 1374|   430k|                                     int16_t *const dst) {
 1375|   430k|  __m256i r[2];
 1376|       |
 1377|   430k|  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
 1378|   430k|  const __m256i d0 = xy_x_round_avx2(r[0]);
 1379|   430k|  const __m256i d1 = xy_x_round_avx2(r[1]);
 1380|   430k|  _mm256_storeu_si256((__m256i *)dst, d0);
 1381|   430k|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
 1382|   430k|}
convolve_2d_avx2.c:xy_x_convolve_2tap_32_avx2:
 1362|   430k|                                              __m256i r[2]) {
 1363|   430k|  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
 1364|   430k|  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
 1365|   430k|  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
 1366|   430k|  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
 1367|       |
 1368|   430k|  r[0] = convolve_2tap_avx2(&ss0, coeffs);
 1369|   430k|  r[1] = convolve_2tap_avx2(&ss1, coeffs);
 1370|   430k|}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_ssse3:
   82|   747k|    __m128i *const coeffs /* [2] */) {
   83|   747k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   84|   747k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   747k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   747k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   85|   747k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
   86|       |
   87|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
   88|       |  // This extra right shift will be taken care of at the end while rounding
   89|       |  // the result.
   90|       |  // Since all filter co-efficients are even, this change will not affect the
   91|       |  // end result
   92|   747k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   93|   747k|                            _mm_set1_epi16((short)0xffff)));
   94|       |
   95|   747k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   96|       |
   97|       |  // coeffs 2 3 2 3 2 3 2 3
   98|   747k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
   99|       |  // coeffs 4 5 4 5 4 5 4 5
  100|   747k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  101|   747k|}
convolve_2d_avx2.c:x_convolve_4tap_2x2_ssse3:
  935|   760k|                                                const __m128i coeffs[2]) {
  936|   760k|  const __m128i sfl0 =
  937|   760k|      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  938|   760k|  const __m128i sfl1 =
  939|   760k|      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  940|   760k|  const __m128i s = load_u8_8x2_sse2(src, stride);
  941|   760k|  __m128i ss[2];
  942|       |
  943|   760k|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  944|   760k|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  945|   760k|  return convolve_4tap_ssse3(ss, coeffs);
  946|   760k|}
convolve_2d_avx2.c:convolve_4tap_ssse3:
  438|  4.18M|                                          const __m128i coeffs[2]) {
  439|  4.18M|  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  440|  4.18M|  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  441|  4.18M|  return _mm_add_epi16(res_23, res_45);
  442|  4.18M|}
convolve_2d_avx2.c:x_convolve_4tap_4x2_ssse3:
  950|  3.42M|                                                const __m128i coeffs[2]) {
  951|  3.42M|  const __m128i s = load_u8_8x2_sse2(src, stride);
  952|  3.42M|  const __m128i sfl0 =
  953|  3.42M|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  954|  3.42M|  const __m128i sfl1 =
  955|  3.42M|      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
  956|  3.42M|  __m128i ss[2];
  957|       |
  958|  3.42M|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  959|  3.42M|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  960|  3.42M|  return convolve_4tap_ssse3(ss, coeffs);
  961|  3.42M|}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_avx2:
  179|  58.6k|    __m256i *const coeffs /* [2] */) {
  180|  58.6k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  181|  58.6k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  58.6k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  58.6k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  182|  58.6k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  183|       |
  184|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  185|       |  // This extra right shift will be taken care of at the end while rounding
  186|       |  // the result.
  187|       |  // Since all filter co-efficients are even, this change will not affect the
  188|       |  // end result
  189|  58.6k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  190|  58.6k|                            _mm_set1_epi16((short)0xffff)));
  191|  58.6k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  192|  58.6k|  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
  193|  58.6k|}
convolve_2d_avx2.c:populate_coeffs_4tap_avx2:
   24|  58.6k|                                             __m256i coeffs[2]) {
   25|  58.6k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   26|       |
   27|       |  // coeffs 2 3 2 3 2 3 2 3
   28|  58.6k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   29|       |  // coeffs 4 5 4 5 4 5 4 5
   30|  58.6k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   31|  58.6k|}
convolve_2d_avx2.c:x_convolve_4tap_8x2_avx2:
  966|   483k|                                               const __m256i filt[2]) {
  967|   483k|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
  968|   483k|  return x_convolve_4tap_avx2(s_256, coeffs, filt);
  969|   483k|}
convolve_2d_avx2.c:x_convolve_4tap_avx2:
  562|  1.47M|                                           const __m256i filt[2]) {
  563|  1.47M|  __m256i ss[2];
  564|       |
  565|  1.47M|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  566|  1.47M|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  567|       |
  568|  1.47M|  return convolve_4tap_avx2(ss, coeffs);
  569|  1.47M|}
convolve_2d_avx2.c:convolve_4tap_avx2:
  470|  1.47M|                                         const __m256i coeffs[2]) {
  471|  1.47M|  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  472|  1.47M|  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  473|  1.47M|  return _mm256_add_epi16(res_23, res_45);
  474|  1.47M|}
convolve_2d_avx2.c:xy_x_round_store_8x2_avx2:
  637|  3.70M|                                             int16_t *const dst) {
  638|  3.70M|  const __m256i d = xy_x_round_avx2(res);
  639|  3.70M|  _mm256_storeu_si256((__m256i *)dst, d);
  640|  3.70M|}
convolve_2d_avx2.c:x_convolve_4tap_16x2_avx2:
  975|   141k|                                             __m256i r[2]) {
  976|   141k|  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
  977|   141k|  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
  978|   141k|}
convolve_2d_avx2.c:xy_x_4tap_32_avx2:
 1387|   494k|                                     int16_t *const dst) {
 1388|   494k|  __m256i r[2];
 1389|       |
 1390|   494k|  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
 1391|   494k|  const __m256i d0 = xy_x_round_avx2(r[0]);
 1392|   494k|  const __m256i d1 = xy_x_round_avx2(r[1]);
 1393|   494k|  _mm256_storeu_si256((__m256i *)dst, d0);
 1394|   494k|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
 1395|   494k|}
convolve_2d_avx2.c:x_convolve_4tap_32_avx2:
  983|   494k|                                           __m256i r[2]) {
  984|   494k|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
  985|   494k|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
  986|       |
  987|   494k|  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
  988|   494k|  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
  989|   494k|}
convolve_2d_avx2.c:prepare_half_coeffs_6tap_avx2:
  197|   890k|    __m256i *const coeffs /* [3] */) {
  198|   890k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  199|   890k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   890k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   890k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  200|   890k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  201|       |
  202|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  203|       |  // This extra right shift will be taken care of at the end while rounding
  204|       |  // the result.
  205|       |  // Since all filter co-efficients are even, this change will not affect the
  206|       |  // end result
  207|   890k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  208|   890k|                            _mm_set1_epi16((short)0xffff)));
  209|   890k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  210|   890k|  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
  211|   890k|}
convolve_2d_avx2.c:populate_coeffs_6tap_avx2:
   34|   890k|                                             __m256i coeffs[3]) {
   35|   890k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   36|       |
   37|       |  // coeffs 1 2 1 2 1 2 1 2
   38|   890k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
   39|       |  // coeffs 3 4 3 4 3 4 3 4
   40|   890k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
   41|       |  // coeffs 5 6 5 6 5 6 5 6
   42|   890k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
   43|   890k|}
convolve_2d_avx2.c:x_convolve_6tap_8x2_avx2:
 1031|  7.82M|                                               const __m256i filt[3]) {
 1032|  7.82M|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1033|  7.82M|  return x_convolve_6tap_avx2(s_256, coeffs, filt);
 1034|  7.82M|}
convolve_2d_avx2.c:x_convolve_6tap_avx2:
  573|  15.7M|                                           const __m256i filt[3]) {
  574|  15.7M|  __m256i ss[3];
  575|       |
  576|  15.7M|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  577|  15.7M|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  578|  15.7M|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  579|       |
  580|  15.7M|  return convolve_6tap_avx2(ss, coeffs);
  581|  15.7M|}
convolve_2d_avx2.c:convolve_6tap_avx2:
  477|  15.7M|                                         const __m256i coeffs[3]) {
  478|  15.7M|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  479|  15.7M|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  480|  15.7M|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  481|  15.7M|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  482|  15.7M|  return _mm256_add_epi16(res_0145, res_23);
  483|  15.7M|}
convolve_2d_avx2.c:x_convolve_6tap_16x2_avx2:
 1040|  2.23M|                                             __m256i r[2]) {
 1041|  2.23M|  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1042|  2.23M|  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1043|  2.23M|}
convolve_2d_avx2.c:xy_x_6tap_32_avx2:
 1400|  3.97M|                                     int16_t *const dst) {
 1401|  3.97M|  __m256i r[2];
 1402|       |
 1403|  3.97M|  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
 1404|  3.97M|  const __m256i d0 = xy_x_round_avx2(r[0]);
 1405|  3.97M|  const __m256i d1 = xy_x_round_avx2(r[1]);
 1406|  3.97M|  _mm256_storeu_si256((__m256i *)dst, d0);
 1407|  3.97M|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
 1408|  3.97M|}
convolve_2d_avx2.c:x_convolve_6tap_32_avx2:
 1048|  3.97M|                                           __m256i r[2]) {
 1049|  3.97M|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1050|  3.97M|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1051|       |
 1052|  3.97M|  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
 1053|  3.97M|  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
 1054|  3.97M|}
convolve_2d_avx2.c:prepare_half_coeffs_8tap_avx2:
  215|  53.9k|    __m256i *const coeffs /* [4] */) {
  216|  53.9k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  217|  53.9k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  53.9k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  53.9k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  218|  53.9k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  219|       |
  220|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  221|       |  // This extra right shift will be taken care of at the end while rounding
  222|       |  // the result.
  223|       |  // Since all filter co-efficients are even, this change will not affect the
  224|       |  // end result
  225|  53.9k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  226|  53.9k|                            _mm_set1_epi16((short)0xffff)));
  227|  53.9k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  228|  53.9k|  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
  229|  53.9k|}
convolve_2d_avx2.c:populate_coeffs_8tap_avx2:
   46|  53.9k|                                             __m256i coeffs[4]) {
   47|  53.9k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   48|       |
   49|       |  // coeffs 0 1 0 1 0 1 0 1
   50|  53.9k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
   51|       |  // coeffs 2 3 2 3 2 3 2 3
   52|  53.9k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   53|       |  // coeffs 4 5 4 5 4 5 4 5
   54|  53.9k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   55|       |  // coeffs 6 7 6 7 6 7 6 7
   56|  53.9k|  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
   57|  53.9k|}
convolve_2d_avx2.c:x_convolve_8tap_8x2_avx2:
 1059|   348k|                                               const __m256i filt[4]) {
 1060|   348k|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1061|   348k|  return x_convolve_8tap_avx2(s_256, coeffs, filt);
 1062|   348k|}
convolve_2d_avx2.c:x_convolve_8tap_avx2:
  585|  2.56M|                                           const __m256i filt[4]) {
  586|  2.56M|  __m256i ss[4];
  587|       |
  588|  2.56M|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  589|  2.56M|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  590|  2.56M|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  591|  2.56M|  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
  592|       |
  593|  2.56M|  return convolve_8tap_avx2(ss, coeffs);
  594|  2.56M|}
convolve_2d_avx2.c:convolve_8tap_avx2:
  486|  2.56M|                                         const __m256i coeffs[4]) {
  487|  2.56M|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  488|  2.56M|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  489|  2.56M|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  490|  2.56M|  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
  491|  2.56M|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  492|  2.56M|  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
  493|  2.56M|  return _mm256_add_epi16(res_0145, res_2367);
  494|  2.56M|}
convolve_2d_avx2.c:x_convolve_8tap_16x2_avx2:
 1068|   100k|                                                       __m256i r[2]) {
 1069|   100k|  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1070|   100k|  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1071|   100k|}
convolve_2d_avx2.c:xy_x_8tap_32_avx2:
 1413|  1.10M|                                     int16_t *const dst) {
 1414|  1.10M|  __m256i r[2];
 1415|       |
 1416|  1.10M|  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
 1417|  1.10M|  const __m256i d0 = xy_x_round_avx2(r[0]);
 1418|  1.10M|  const __m256i d1 = xy_x_round_avx2(r[1]);
 1419|  1.10M|  _mm256_storeu_si256((__m256i *)dst, d0);
 1420|  1.10M|  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
 1421|  1.10M|}
convolve_2d_avx2.c:x_convolve_8tap_32_avx2:
 1076|  1.10M|                                                     __m256i r[2]) {
 1077|  1.10M|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1078|  1.10M|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1079|       |
 1080|  1.10M|  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
 1081|  1.10M|  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
 1082|  1.10M|}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2:
 1437|  3.03k|    const int16_t *const src, __m128i s_32[2]) {
 1438|  3.03k|  __m128i s_128[2];
 1439|       |
 1440|  3.03k|  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
 1441|  3.03k|  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 1442|  3.03k|  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
 1443|  3.03k|  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
 1444|  3.03k|  return _mm_add_epi16(s_128[0], s_128[1]);
 1445|  3.03k|}
convolve_2d_avx2.c:xy_y_round_half_pel_sse2:
  662|  18.2k|static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
  663|  18.2k|  const __m128i round = _mm_set1_epi16(16);
  664|  18.2k|  const __m128i dst = _mm_add_epi16(src, round);
  665|  18.2k|  return _mm_srai_epi16(dst, 5);
  666|  18.2k|}
convolve_2d_avx2.c:pack_store_2x2_sse2:
  687|   413k|                                       const ptrdiff_t stride) {
  688|   413k|  const __m128i d = _mm_packus_epi16(res, res);
  689|   413k|  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
  690|   413k|  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
  691|   413k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2:
 1464|  15.1k|    const int16_t *const src, __m128i s_64[2]) {
 1465|  15.1k|  __m128i s_128[2];
 1466|       |
 1467|  15.1k|  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
 1468|  15.1k|  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
 1469|  15.1k|  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
 1470|  15.1k|  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
 1471|  15.1k|  return _mm_add_epi16(s_128[0], s_128[1]);
 1472|  15.1k|}
convolve_2d_avx2.c:pack_store_4x2_sse2:
  694|  79.8k|                                       const ptrdiff_t stride) {
  695|  79.8k|  const __m128i d = _mm_packus_epi16(res, res);
  696|  79.8k|  store_u8_4x2_sse2(d, dst, stride);
  697|  79.8k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2:
 1497|  17.0k|    const int16_t *const src, __m128i s_128[2]) {
 1498|  17.0k|  __m256i s_256[2];
 1499|  17.0k|  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
 1500|  17.0k|  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|  17.0k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  17.0k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1501|  17.0k|  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
 1502|  17.0k|  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
  ------------------
  |  |   29|  17.0k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  17.0k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1503|  17.0k|  return _mm256_add_epi16(s_256[0], s_256[1]);
 1504|  17.0k|}
convolve_2d_avx2.c:xy_y_round_half_pel_avx2:
  680|   315k|static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
  681|   315k|  const __m256i round = _mm256_set1_epi16(16);
  682|   315k|  const __m256i dst = _mm256_add_epi16(src, round);
  683|   315k|  return _mm256_srai_epi16(dst, 5);
  684|   315k|}
convolve_2d_avx2.c:pack_store_8x2_avx2:
  710|  2.30M|                                       const ptrdiff_t stride) {
  711|  2.30M|  const __m256i d = _mm256_packus_epi16(res, res);
  712|  2.30M|  const __m128i d0 = _mm256_castsi256_si128(d);
  713|  2.30M|  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  714|  2.30M|  _mm_storel_epi64((__m128i *)dst, d0);
  715|  2.30M|  _mm_storel_epi64((__m128i *)(dst + stride), d1);
  716|  2.30M|}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2:
 1507|  16.4k|    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
 1508|  16.4k|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
 1509|  16.4k|  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
 1510|  16.4k|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
 1511|  16.4k|  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
 1512|  16.4k|}
convolve_2d_avx2.c:xy_y_pack_store_16x2_avx2:
  728|  1.71M|                                             const ptrdiff_t stride) {
  729|  1.71M|  const __m256i t = _mm256_packus_epi16(res0, res1);
  730|  1.71M|  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
  731|  1.71M|  storeu_u8_16x2_avx2(d, dst, stride);
  732|  1.71M|}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2:
 1565|   132k|    uint8_t *const dst) {
 1566|   132k|  __m256i r[2];
 1567|       |
 1568|   132k|  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
 1569|   132k|  r[0] = xy_y_round_half_pel_avx2(r[0]);
 1570|   132k|  r[1] = xy_y_round_half_pel_avx2(r[1]);
 1571|   132k|  xy_y_pack_store_32_avx2(r[0], r[1], dst);
 1572|   132k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2:
 1556|   132k|                                                       __m256i r[2]) {
 1557|   132k|  s1[0] = _mm256_loadu_si256((__m256i *)src);
 1558|   132k|  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
 1559|   132k|  r[0] = _mm256_add_epi16(s0[0], s1[0]);
 1560|   132k|  r[1] = _mm256_add_epi16(s0[1], s1[1]);
 1561|   132k|}
convolve_2d_avx2.c:xy_y_pack_store_32_avx2:
  759|  5.14M|                                           uint8_t *const dst) {
  760|  5.14M|  const __m256i d = _mm256_packus_epi16(res0, res1);
  761|       |  // d = _mm256_permute4x64_epi64(d, 0xD8);
  762|  5.14M|  _mm256_storeu_si256((__m256i *)dst, d);
  763|  5.14M|}
convolve_2d_avx2.c:load_16bit_8rows_avx2:
  377|    494|                                                   __m256i dst[8]) {
  378|    494|  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  379|    494|  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  380|    494|  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  381|    494|  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  382|    494|  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  383|    494|  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
  384|    494|  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
  385|    494|  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
  386|    494|}
convolve_2d_avx2.c:prepare_coeffs_2tap_sse2:
  233|  28.4k|    __m128i *const coeffs /* [1] */) {
  234|  28.4k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  235|  28.4k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  28.4k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  28.4k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  236|       |
  237|  28.4k|  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  238|       |
  239|       |  // coeffs 3 4 3 4 3 4 3 4
  240|  28.4k|  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
  241|  28.4k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_sse2:
 1425|  13.4k|                                                  const __m128i coeffs[1]) {
 1426|  13.4k|  __m128i s_128[2];
 1427|       |
 1428|  13.4k|  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
 1429|  13.4k|  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 1430|  13.4k|  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
 1431|  13.4k|  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
 1432|  13.4k|  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
 1433|  13.4k|  return convolve16_2tap_sse2(&ss, coeffs);
 1434|  13.4k|}
convolve_2d_avx2.c:convolve16_2tap_sse2:
  497|   142k|                                           const __m128i coeffs[1]) {
  498|   142k|  return _mm_madd_epi16(ss[0], coeffs[0]);
  499|   142k|}
convolve_2d_avx2.c:xy_y_round_store_2x2_sse2:
  743|   410k|                                             const ptrdiff_t stride) {
  744|   410k|  const __m128i r = xy_y_round_sse2(res);
  745|   410k|  const __m128i rr = _mm_packs_epi32(r, r);
  746|   410k|  pack_store_2x2_sse2(rr, dst, stride);
  747|   410k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_sse2:
 1450|  64.6k|                                               __m128i r[2]) {
 1451|  64.6k|  __m128i s_128[2];
 1452|       |
 1453|  64.6k|  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
 1454|  64.6k|  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
 1455|  64.6k|  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
 1456|  64.6k|  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
 1457|  64.6k|  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
 1458|  64.6k|  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
 1459|  64.6k|  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
 1460|  64.6k|  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
 1461|  64.6k|}
convolve_2d_avx2.c:xy_y_round_sse2:
  656|   539k|static inline __m128i xy_y_round_sse2(const __m128i src) {
  657|   539k|  const __m128i round = _mm_set1_epi32(1024);
  658|   539k|  const __m128i dst = _mm_add_epi32(src, round);
  659|   539k|  return _mm_srai_epi32(dst, 11);
  660|   539k|}
convolve_2d_avx2.c:prepare_coeffs_2tap_avx2:
  292|  26.4k|    __m256i *const coeffs /* [1] */) {
  293|  26.4k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  294|  26.4k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  26.4k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  26.4k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  295|       |
  296|  26.4k|  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  297|  26.4k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  298|       |
  299|       |  // coeffs 3 4 3 4 3 4 3 4
  300|  26.4k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  301|  26.4k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_avx2:
 1487|  52.7k|                                               __m256i r[2]) {
 1488|  52.7k|  __m256i s_256[2];
 1489|  52.7k|  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
 1490|  52.7k|  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|  52.7k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  52.7k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1491|  52.7k|  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
 1492|  52.7k|  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
  ------------------
  |  |   29|  52.7k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  52.7k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1493|  52.7k|  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
 1494|  52.7k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_16_avx2:
 1477|   862k|                                              __m256i r[2]) {
 1478|   862k|  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
 1479|   862k|  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
 1480|   862k|  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
 1481|   862k|  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
 1482|   862k|}
convolve_2d_avx2.c:convolve16_2tap_avx2:
  529|  1.72M|                                           const __m256i coeffs[1]) {
  530|  1.72M|  return _mm256_madd_epi16(ss[0], coeffs[0]);
  531|  1.72M|}
convolve_2d_avx2.c:xy_y_round_store_8x2_avx2:
 1968|  2.28M|                                             const ptrdiff_t stride) {
 1969|  2.28M|  const __m256i r = xy_y_round_16_avx2(res);
 1970|  2.28M|  pack_store_8x2_avx2(r, dst, stride);
 1971|  2.28M|}
convolve_2d_avx2.c:xy_y_round_16_avx2:
  674|  15.6M|static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
  675|  15.6M|  const __m256i r0 = xy_y_round_avx2(r[0]);
  676|  15.6M|  const __m256i r1 = xy_y_round_avx2(r[1]);
  677|  15.6M|  return _mm256_packs_epi32(r0, r1);
  678|  15.6M|}
convolve_2d_avx2.c:xy_y_round_avx2:
  668|  33.3M|static inline __m256i xy_y_round_avx2(const __m256i src) {
  669|  33.3M|  const __m256i round = _mm256_set1_epi32(1024);
  670|  33.3M|  const __m256i dst = _mm256_add_epi32(src, round);
  671|  33.3M|  return _mm256_srai_epi32(dst, 11);
  672|  33.3M|}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_avx2:
 1524|  31.2k|                                                __m256i r[4]) {
 1525|  31.2k|  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
 1526|  31.2k|  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
 1527|  31.2k|  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
 1528|  31.2k|  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
 1529|  31.2k|}
convolve_2d_avx2.c:xy_y_round_store_16x2_avx2:
 1975|  1.69M|                                              const ptrdiff_t stride) {
 1976|  1.69M|  const __m256i r0 = xy_y_round_16_avx2(res + 0);
 1977|  1.69M|  const __m256i r1 = xy_y_round_16_avx2(res + 2);
 1978|  1.69M|  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
 1979|  1.69M|}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_all_avx2:
 1546|   280k|                                                  uint8_t *const dst) {
 1547|   280k|  __m256i r[4];
 1548|       |
 1549|   280k|  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
 1550|   280k|  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
 1551|   280k|}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_avx2:
 1535|   280k|                                              __m256i r[4]) {
 1536|   280k|  s1[0] = _mm256_loadu_si256((__m256i *)src);
 1537|   280k|  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
 1538|   280k|  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
 1539|   280k|  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
 1540|   280k|}
convolve_2d_avx2.c:xy_y_round_store_32_avx2:
  767|  5.01M|                                            uint8_t *const dst) {
  768|  5.01M|  const __m256i ra = xy_y_round_16_avx2(r0);
  769|  5.01M|  const __m256i rb = xy_y_round_16_avx2(r1);
  770|  5.01M|  xy_y_pack_store_32_avx2(ra, rb, dst);
  771|  5.01M|}
convolve_2d_avx2.c:prepare_coeffs_4tap_sse2:
  245|  86.6k|    __m128i *const coeffs /* [2] */) {
  246|  86.6k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  247|  86.6k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  86.6k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  86.6k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  248|       |
  249|  86.6k|  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
  250|       |
  251|       |  // coeffs 2 3 2 3 2 3 2 3
  252|  86.6k|  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
  253|       |  // coeffs 4 5 4 5 4 5 4 5
  254|  86.6k|  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
  255|  86.6k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_2x2_sse2:
 1577|   146k|                                                  const __m128i coeffs[2]) {
 1578|   146k|  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
 1579|   146k|  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
 1580|   146k|  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
 1581|   146k|  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
 1582|   146k|  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
 1583|   146k|  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
 1584|   146k|  ss_128[0] = ss_128[1];
 1585|   146k|  return r;
 1586|   146k|}
convolve_2d_avx2.c:convolve16_4tap_sse2:
  502|   146k|                                           const __m128i coeffs[2]) {
  503|   146k|  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  504|   146k|  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  505|   146k|  return _mm_add_epi32(res_01, res_23);
  506|   146k|}
convolve_2d_avx2.c:prepare_coeffs_4tap_avx2:
  305|   792k|    __m256i *const coeffs /* [2] */) {
  306|   792k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  307|   792k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   792k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   792k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  308|       |
  309|   792k|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  310|   792k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  311|       |
  312|       |  // coeffs 2 3 2 3 2 3 2 3
  313|   792k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
  314|       |  // coeffs 4 5 4 5 4 5 4 5
  315|   792k|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
  316|   792k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_4x2_avx2:
 1591|   726k|                                                  const __m256i coeffs[2]) {
 1592|   726k|  __m256i s_256[2];
 1593|   726k|  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
 1594|   726k|  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|   726k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   726k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1595|   726k|  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
 1596|   726k|  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
  ------------------
  |  |   29|   726k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   726k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1597|   726k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1598|   726k|  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
 1599|   726k|  ss_256[0] = ss_256[1];
 1600|   726k|  return r;
 1601|   726k|}
convolve_2d_avx2.c:convolve16_4tap_avx2:
  534|  6.65M|                                           const __m256i coeffs[2]) {
  535|  6.65M|  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
  536|  6.65M|  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
  537|  6.65M|  return _mm256_add_epi32(res_1, res_2);
  538|  6.65M|}
convolve_2d_avx2.c:xy_y_round_store_4x2_avx2:
  751|  1.98M|                                             const ptrdiff_t stride) {
  752|  1.98M|  const __m256i r = xy_y_round_avx2(res);
  753|  1.98M|  const __m256i rr = _mm256_packs_epi32(r, r);
  754|  1.98M|  pack_store_4x2_avx2(rr, dst, stride);
  755|  1.98M|}
convolve_2d_avx2.c:pack_store_4x2_avx2:
  700|  1.98M|                                       const ptrdiff_t stride) {
  701|  1.98M|  const __m256i d = _mm256_packus_epi16(res, res);
  702|  1.98M|  const __m128i d0 = _mm256_castsi256_si128(d);
  703|  1.98M|  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  704|       |
  705|  1.98M|  xx_storel_32(dst, d0);
  706|  1.98M|  xx_storel_32(dst + stride, d1);
  707|  1.98M|}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_avx2:
 1613|   482k|                                               __m256i r[2]) {
 1614|   482k|  __m256i s_256[2];
 1615|   482k|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
 1616|   482k|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
 1617|   482k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1618|   482k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
 1619|   482k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
 1620|   482k|  ss_256[0] = ss_256[1];
 1621|   482k|  ss_256[2] = ss_256[3];
 1622|   482k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_16_avx2:
 1605|  2.96M|                                              __m256i r[2]) {
 1606|  2.96M|  r[0] = convolve16_4tap_avx2(ss, coeffs);
 1607|  2.96M|  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
 1608|  2.96M|}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2:
 1626|  88.0k|    __m256i r[2]) {
 1627|  88.0k|  __m256i a_256[2];
 1628|  88.0k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
 1629|  88.0k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
 1630|  88.0k|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
 1631|  88.0k|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
 1632|  88.0k|  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
 1633|  88.0k|  s_256[0] = s_256[2];
 1634|  88.0k|  s_256[1] = s_256[3];
 1635|  88.0k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_avx2:
 1639|   266k|    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
 1640|   266k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
 1641|   266k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1642|   266k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
 1643|   266k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
 1644|   266k|  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
 1645|   266k|  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
 1646|   266k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
 1647|   266k|  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
 1648|   266k|  ss_256[0] = ss_256[1];
 1649|   266k|  ss_256[2] = ss_256[3];
 1650|   266k|  tt_256[0] = tt_256[1];
 1651|   266k|  tt_256[2] = tt_256[3];
 1652|   266k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2:
 1674|  49.0k|    __m256i r[4]) {
 1675|  49.0k|  __m256i a_256[2];
 1676|       |
 1677|  49.0k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
 1678|  49.0k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
 1679|       |
 1680|  49.0k|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
 1681|  49.0k|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
 1682|  49.0k|  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
 1683|       |
 1684|  49.0k|  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
 1685|  49.0k|  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
 1686|  49.0k|  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
 1687|       |
 1688|  49.0k|  s_256[0] = s_256[2];
 1689|  49.0k|  s_256[1] = s_256[3];
 1690|  49.0k|  s_256[2] = s_256[4];
 1691|  49.0k|}
convolve_2d_avx2.c:loadu_unpack_16bit_3rows_avx2:
  410|  27.1k|    __m256i ss_256[3], __m256i tt_256[3]) {
  411|  27.1k|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  412|  27.1k|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  413|  27.1k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  414|       |
  415|  27.1k|  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  416|  27.1k|  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  417|       |
  418|  27.1k|  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  419|  27.1k|  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  420|  27.1k|}
convolve_2d_avx2.c:xy_y_convolve_4tap_32x2_avx2:
 1657|   444k|    __m256i r[4]) {
 1658|   444k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
 1659|   444k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1660|   444k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
 1661|   444k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
 1662|   444k|  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
 1663|   444k|  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
 1664|   444k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
 1665|   444k|  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
 1666|   444k|  ss_256[0] = ss_256[1];
 1667|   444k|  ss_256[2] = ss_256[3];
 1668|   444k|  tt_256[0] = tt_256[1];
 1669|   444k|  tt_256[2] = tt_256[3];
 1670|   444k|}
convolve_2d_avx2.c:prepare_coeffs_6tap_ssse3:
  259|  59.7k|    __m128i *const coeffs /* [3] */) {
  260|  59.7k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  261|  59.7k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  59.7k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  59.7k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  262|  59.7k|  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
  263|       |
  264|       |  // coeffs 1 2 1 2 1 2 1 2
  265|  59.7k|  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
  266|       |  // coeffs 3 4 3 4 3 4 3 4
  267|  59.7k|  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
  268|       |  // coeffs 5 6 5 6 5 6 5 6
  269|  59.7k|  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
  270|  59.7k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_2x2_sse2:
 1696|   239k|                                                  const __m128i coeffs[3]) {
 1697|   239k|  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
 1698|   239k|  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
 1699|   239k|  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
 1700|   239k|  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
 1701|   239k|  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
 1702|   239k|  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
 1703|   239k|  ss_128[0] = ss_128[1];
 1704|   239k|  ss_128[1] = ss_128[2];
 1705|   239k|  return r;
 1706|   239k|}
convolve_2d_avx2.c:convolve16_6tap_sse2:
  509|   239k|                                           const __m128i coeffs[3]) {
  510|   239k|  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  511|   239k|  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  512|   239k|  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  513|   239k|  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  514|   239k|  return _mm_add_epi32(res_0123, res_45);
  515|   239k|}
convolve_2d_avx2.c:prepare_coeffs_6tap_avx2:
  320|   758k|    __m256i *const coeffs /* [3]*/) {
  321|   758k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  322|   758k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   758k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   758k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  323|   758k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  324|   758k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
  325|       |
  326|       |  // coeffs 1 2 1 2 1 2 1 2
  327|   758k|  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
  328|       |  // coeffs 3 4 3 4 3 4 3 4
  329|   758k|  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
  330|       |  // coeffs 5 6 5 6 5 6 5 6
  331|   758k|  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
  332|   758k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_4x2_avx2:
 1711|  1.20M|                                                  const __m256i coeffs[3]) {
 1712|  1.20M|  __m256i s_256[2];
 1713|  1.20M|  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
 1714|  1.20M|  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
  ------------------
  |  |   29|  1.20M|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  1.20M|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1715|  1.20M|  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
 1716|  1.20M|  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
  ------------------
  |  |   29|  1.20M|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  1.20M|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1717|  1.20M|  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1718|  1.20M|  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
 1719|  1.20M|  ss_256[0] = ss_256[1];
 1720|  1.20M|  ss_256[1] = ss_256[2];
 1721|  1.20M|  return r;
 1722|  1.20M|}
convolve_2d_avx2.c:convolve16_6tap_avx2:
  541|  21.2M|                                           const __m256i coeffs[3]) {
  542|  21.2M|  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  543|  21.2M|  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  544|  21.2M|  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  545|  21.2M|  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  546|  21.2M|  return _mm256_add_epi32(res_0123, res_45);
  547|  21.2M|}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_avx2:
 1734|  1.23M|                                               __m256i r[2]) {
 1735|  1.23M|  __m256i s_256[2];
 1736|  1.23M|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
 1737|  1.23M|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
 1738|  1.23M|  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1739|  1.23M|  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
 1740|  1.23M|  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
 1741|  1.23M|  ss_256[0] = ss_256[1];
 1742|  1.23M|  ss_256[1] = ss_256[2];
 1743|  1.23M|  ss_256[3] = ss_256[4];
 1744|  1.23M|  ss_256[4] = ss_256[5];
 1745|  1.23M|}
convolve_2d_avx2.c:xy_y_convolve_6tap_16_avx2:
 1726|  10.0M|                                              __m256i r[2]) {
 1727|  10.0M|  r[0] = convolve16_6tap_avx2(ss, coeffs);
 1728|  10.0M|  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
 1729|  10.0M|}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2:
 1749|   358k|    __m256i r[2]) {
 1750|   358k|  __m256i a_256[2], ss_256[4];
 1751|   358k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
 1752|   358k|  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
 1753|   358k|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
 1754|   358k|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
 1755|   358k|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1756|   358k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1757|   358k|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1758|   358k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
 1759|   358k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
 1760|   358k|  s_256[0] = s_256[2];
 1761|   358k|  s_256[1] = s_256[3];
 1762|   358k|  s_256[2] = s_256[4];
 1763|   358k|  s_256[3] = s_256[5];
 1764|   358k|}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_avx2:
 1769|  4.38M|    __m256i r[4]) {
 1770|  4.38M|  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
 1771|  4.38M|  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
 1772|  4.38M|  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
 1773|  4.38M|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
 1774|  4.38M|  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
 1775|  4.38M|  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
 1776|       |
 1777|  4.38M|  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
 1778|  4.38M|  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
 1779|       |
 1780|  4.38M|  ss_256[0] = ss_256[1];
 1781|  4.38M|  ss_256[1] = ss_256[2];
 1782|  4.38M|  ss_256[3] = ss_256[4];
 1783|  4.38M|  ss_256[4] = ss_256[5];
 1784|       |
 1785|  4.38M|  tt_256[0] = tt_256[1];
 1786|  4.38M|  tt_256[1] = tt_256[2];
 1787|  4.38M|  tt_256[3] = tt_256[4];
 1788|  4.38M|  tt_256[4] = tt_256[5];
 1789|  4.38M|}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2:
 1793|   321k|    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
 1794|   321k|  __m256i a_256[2];
 1795|       |
 1796|   321k|  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
 1797|   321k|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
 1798|   321k|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
 1799|   321k|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1800|   321k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
 1801|   321k|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1802|   321k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
 1803|   321k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
 1804|       |
 1805|   321k|  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
 1806|   321k|  s_256[0] = s_256[2];
 1807|   321k|  s_256[2] = s_256[4];
 1808|   321k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
 1809|   321k|  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
 1810|   321k|  s_256[1] = s_256[3];
 1811|   321k|  s_256[3] = s_256[5];
 1812|   321k|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1813|   321k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
 1814|   321k|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1815|   321k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
 1816|   321k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
 1817|   321k|}
convolve_2d_avx2.c:loadu_unpack_16bit_5rows_avx2:
  390|   186k|    __m256i ss_256[5], __m256i tt_256[5]) {
  391|   186k|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  392|   186k|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  393|   186k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  394|   186k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  395|   186k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  396|       |
  397|   186k|  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  398|   186k|  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  399|   186k|  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  400|   186k|  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  401|       |
  402|   186k|  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  403|   186k|  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
  404|   186k|  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  405|   186k|  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
  406|   186k|}
convolve_2d_avx2.c:prepare_coeffs_8tap_sse2:
  274|  2.74k|    __m128i *const coeffs /* [4] */) {
  275|  2.74k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  276|  2.74k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  2.74k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  2.74k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  277|       |
  278|  2.74k|  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
  279|       |
  280|       |  // coeffs 0 1 0 1 0 1 0 1
  281|  2.74k|  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
  282|       |  // coeffs 2 3 2 3 2 3 2 3
  283|  2.74k|  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
  284|       |  // coeffs 4 5 4 5 4 5 4 5
  285|  2.74k|  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
  286|       |  // coeffs 6 7 6 7 6 7 6 7
  287|  2.74k|  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
  288|  2.74k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_2x2_sse2:
 1822|  10.9k|                                                  const __m128i coeffs[4]) {
 1823|  10.9k|  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
 1824|  10.9k|  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
 1825|  10.9k|  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
 1826|  10.9k|  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
 1827|  10.9k|  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
 1828|  10.9k|  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
 1829|  10.9k|  ss_128[0] = ss_128[1];
 1830|  10.9k|  ss_128[1] = ss_128[2];
 1831|  10.9k|  ss_128[2] = ss_128[3];
 1832|  10.9k|  return r;
 1833|  10.9k|}
convolve_2d_avx2.c:convolve16_8tap_sse2:
  518|  10.9k|                                           const __m128i coeffs[4]) {
  519|  10.9k|  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  520|  10.9k|  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  521|  10.9k|  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  522|  10.9k|  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
  523|  10.9k|  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  524|  10.9k|  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
  525|  10.9k|  return _mm_add_epi32(res_0123, res_4567);
  526|  10.9k|}
convolve_2d_avx2.c:prepare_coeffs_8tap_avx2:
  336|  49.4k|    __m256i *const coeffs /* [4] */) {
  337|  49.4k|  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
  338|  49.4k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  49.4k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  49.4k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  339|       |
  340|  49.4k|  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  341|  49.4k|  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
  342|       |
  343|       |  // coeffs 0 1 0 1 0 1 0 1
  344|  49.4k|  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  345|       |  // coeffs 2 3 2 3 2 3 2 3
  346|  49.4k|  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  347|       |  // coeffs 4 5 4 5 4 5 4 5
  348|  49.4k|  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  349|       |  // coeffs 6 7 6 7 6 7 6 7
  350|  49.4k|  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
  351|  49.4k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_4x2_avx2:
 1838|  52.6k|                                                  const __m256i coeffs[4]) {
 1839|  52.6k|  __m256i s_256[2];
 1840|  52.6k|  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
 1841|  52.6k|  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
  ------------------
  |  |   29|  52.6k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  52.6k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1842|  52.6k|  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
 1843|  52.6k|  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
  ------------------
  |  |   29|  52.6k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  52.6k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1844|  52.6k|  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1845|  52.6k|  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
 1846|  52.6k|  ss_256[0] = ss_256[1];
 1847|  52.6k|  ss_256[1] = ss_256[2];
 1848|  52.6k|  ss_256[2] = ss_256[3];
 1849|  52.6k|  return r;
 1850|  52.6k|}
convolve_2d_avx2.c:convolve16_8tap_avx2:
  550|  3.78M|                                           const __m256i coeffs[4]) {
  551|  3.78M|  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  552|  3.78M|  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  553|  3.78M|  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  554|  3.78M|  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
  555|  3.78M|  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  556|  3.78M|  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
  557|  3.78M|  return _mm256_add_epi32(res_0123, res_4567);
  558|  3.78M|}
convolve_2d_avx2.c:convolve_8tap_unpack_avx2:
  423|   133k|                                             __m256i ss[7]) {
  424|   133k|  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
  425|   133k|  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
  426|   133k|  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
  427|   133k|  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
  428|   133k|  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
  429|   133k|  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
  430|   133k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_avx2:
 1862|  43.1k|                                               __m256i r[2]) {
 1863|  43.1k|  __m256i s_256[2];
 1864|  43.1k|  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
 1865|  43.1k|  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
 1866|  43.1k|  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
 1867|  43.1k|  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
 1868|  43.1k|  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
 1869|  43.1k|  ss_256[0] = ss_256[1];
 1870|  43.1k|  ss_256[1] = ss_256[2];
 1871|  43.1k|  ss_256[2] = ss_256[3];
 1872|  43.1k|  ss_256[4] = ss_256[5];
 1873|  43.1k|  ss_256[5] = ss_256[6];
 1874|  43.1k|  ss_256[6] = ss_256[7];
 1875|  43.1k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_16_avx2:
 1854|  1.86M|                                              __m256i r[2]) {
 1855|  1.86M|  r[0] = convolve16_8tap_avx2(ss, coeffs);
 1856|  1.86M|  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
 1857|  1.86M|}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2:
 1879|  22.0k|    __m256i r[2]) {
 1880|  22.0k|  __m256i a_256[4], ss_256[4];
 1881|       |
 1882|  22.0k|  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
 1883|  22.0k|  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
 1884|  22.0k|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
 1885|  22.0k|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
 1886|  22.0k|  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
 1887|  22.0k|  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
 1888|  22.0k|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1889|  22.0k|  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
 1890|  22.0k|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1891|  22.0k|  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
 1892|  22.0k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
 1893|  22.0k|  s_256[0] = s_256[2];
 1894|  22.0k|  s_256[1] = s_256[3];
 1895|  22.0k|  s_256[2] = s_256[4];
 1896|  22.0k|  s_256[3] = s_256[5];
 1897|  22.0k|  s_256[4] = s_256[6];
 1898|  22.0k|  s_256[5] = s_256[7];
 1899|  22.0k|}
convolve_2d_avx2.c:load_16bit_7rows_avx2:
  365|  65.4k|                                         __m256i dst[7]) {
  366|  65.4k|  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  367|  65.4k|  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  368|  65.4k|  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  369|  65.4k|  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  370|  65.4k|  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  371|  65.4k|  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
  372|  65.4k|  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
  373|  65.4k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_avx2:
 1903|   911k|    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
 1904|   911k|  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
 1905|   911k|  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
 1906|   911k|  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
 1907|   911k|  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
 1908|   911k|  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
 1909|   911k|  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
 1910|       |
 1911|   911k|  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
 1912|   911k|  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
 1913|       |
 1914|   911k|  ss_256[0] = ss_256[1];
 1915|   911k|  ss_256[1] = ss_256[2];
 1916|   911k|  ss_256[2] = ss_256[3];
 1917|   911k|  ss_256[4] = ss_256[5];
 1918|   911k|  ss_256[5] = ss_256[6];
 1919|   911k|  ss_256[6] = ss_256[7];
 1920|       |
 1921|   911k|  tt_256[0] = tt_256[1];
 1922|   911k|  tt_256[1] = tt_256[2];
 1923|   911k|  tt_256[2] = tt_256[3];
 1924|   911k|  tt_256[4] = tt_256[5];
 1925|   911k|  tt_256[5] = tt_256[6];
 1926|   911k|  tt_256[6] = tt_256[7];
 1927|   911k|}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2:
 1931|  17.4k|    __m256i s_256[8], __m256i r[4]) {
 1932|  17.4k|  __m256i a_256[4], ss_256[4];
 1933|  17.4k|  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
 1934|       |
 1935|  17.4k|  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
 1936|  17.4k|  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
 1937|  17.4k|  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
 1938|  17.4k|  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
 1939|  17.4k|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1940|  17.4k|  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
 1941|  17.4k|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1942|  17.4k|  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
 1943|       |
 1944|  17.4k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
 1945|       |
 1946|  17.4k|  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
 1947|  17.4k|  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
 1948|  17.4k|  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
 1949|  17.4k|  s_256[0] = s_256[2];
 1950|  17.4k|  s_256[2] = s_256[4];
 1951|  17.4k|  s_256[4] = s_256[6];
 1952|  17.4k|  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
 1953|       |
 1954|  17.4k|  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
 1955|  17.4k|  s_256[1] = s_256[3];
 1956|  17.4k|  s_256[3] = s_256[5];
 1957|  17.4k|  s_256[5] = s_256[7];
 1958|  17.4k|  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
 1959|  17.4k|  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
 1960|  17.4k|  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
 1961|  17.4k|  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
 1962|       |
 1963|  17.4k|  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
 1964|  17.4k|}
convolve_avx2.c:av1_convolve_y_sr_specialized_avx2:
 2008|   747k|    const int32_t subpel_y_q4) {
 2009|   747k|  int32_t x, y;
 2010|   747k|  __m128i coeffs_128[4];
 2011|   747k|  __m256i coeffs_256[4];
 2012|       |
 2013|   747k|  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
 2014|       |
 2015|   747k|  if (vert_tap == 2) {
  ------------------
  |  Branch (2015:7): [True: 37.2k, False: 710k]
  ------------------
 2016|       |    // vert_filt as 2 tap
 2017|  37.2k|    const uint8_t *src_ptr = src;
 2018|       |
 2019|  37.2k|    y = h;
 2020|       |
 2021|  37.2k|    if (subpel_y_q4 != 8) {
  ------------------
  |  Branch (2021:9): [True: 22.8k, False: 14.4k]
  ------------------
 2022|  22.8k|      if (w <= 8) {
  ------------------
  |  Branch (2022:11): [True: 17.5k, False: 5.29k]
  ------------------
 2023|  17.5k|        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
 2024|  17.5k|                                       coeffs_128);
 2025|       |
 2026|  17.5k|        if (w == 2) {
  ------------------
  |  Branch (2026:13): [True: 2.84k, False: 14.6k]
  ------------------
 2027|  2.84k|          __m128i s_16[2];
 2028|       |
 2029|  2.84k|          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
 2030|       |
 2031|  5.59k|          do {
 2032|  5.59k|            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
 2033|  5.59k|                                                          coeffs_128, s_16);
 2034|  5.59k|            const __m128i r = sr_y_round_sse2(res);
 2035|  5.59k|            pack_store_2x2_sse2(r, dst, dst_stride);
 2036|  5.59k|            src_ptr += 2 * src_stride;
 2037|  5.59k|            dst += 2 * dst_stride;
 2038|  5.59k|            y -= 2;
 2039|  5.59k|          } while (y);
  ------------------
  |  Branch (2039:20): [True: 2.74k, False: 2.84k]
  ------------------
 2040|  14.6k|        } else if (w == 4) {
  ------------------
  |  Branch (2040:20): [True: 8.80k, False: 5.86k]
  ------------------
 2041|  8.80k|          __m128i s_32[2];
 2042|       |
 2043|  8.80k|          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
 2044|       |
 2045|  26.0k|          do {
 2046|  26.0k|            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
 2047|  26.0k|                                                          coeffs_128, s_32);
 2048|  26.0k|            const __m128i r = sr_y_round_sse2(res);
 2049|  26.0k|            pack_store_4x2_sse2(r, dst, dst_stride);
 2050|  26.0k|            src_ptr += 2 * src_stride;
 2051|  26.0k|            dst += 2 * dst_stride;
 2052|  26.0k|            y -= 2;
 2053|  26.0k|          } while (y);
  ------------------
  |  Branch (2053:20): [True: 17.1k, False: 8.80k]
  ------------------
 2054|  8.80k|        } else {
 2055|  5.86k|          __m128i s_64[2], s_128[2];
 2056|       |
 2057|  5.86k|          assert(w == 8);
 2058|       |
 2059|  5.86k|          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
 2060|       |
 2061|  20.2k|          do {
 2062|       |            // Note: Faster than binding to AVX2 registers.
 2063|  20.2k|            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
 2064|  20.2k|            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
 2065|  20.2k|            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2066|  20.2k|            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
 2067|  20.2k|            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
 2068|  20.2k|            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
 2069|  20.2k|            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
 2070|  20.2k|            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
 2071|  20.2k|            const __m128i r0 = sr_y_round_sse2(res0);
 2072|  20.2k|            const __m128i r1 = sr_y_round_sse2(res1);
 2073|  20.2k|            const __m128i d = _mm_packus_epi16(r0, r1);
 2074|  20.2k|            _mm_storel_epi64((__m128i *)dst, d);
 2075|  20.2k|            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
 2076|  20.2k|            src_ptr += 2 * src_stride;
 2077|  20.2k|            dst += 2 * dst_stride;
 2078|  20.2k|            y -= 2;
 2079|  20.2k|          } while (y);
  ------------------
  |  Branch (2079:20): [True: 14.3k, False: 5.86k]
  ------------------
 2080|  5.86k|        }
 2081|  17.5k|      } else {
 2082|  5.29k|        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
 2083|       |
 2084|  5.29k|        if (w == 16) {
  ------------------
  |  Branch (2084:13): [True: 2.99k, False: 2.30k]
  ------------------
 2085|  2.99k|          __m128i s_128[2];
 2086|       |
 2087|  2.99k|          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
 2088|       |
 2089|  18.8k|          do {
 2090|  18.8k|            __m256i r[2];
 2091|       |
 2092|  18.8k|            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
 2093|  18.8k|                                      r);
 2094|  18.8k|            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
 2095|  18.8k|            src_ptr += 2 * src_stride;
 2096|  18.8k|            dst += 2 * dst_stride;
 2097|  18.8k|            y -= 2;
 2098|  18.8k|          } while (y);
  ------------------
  |  Branch (2098:20): [True: 15.8k, False: 2.99k]
  ------------------
 2099|  2.99k|        } else if (w == 32) {
  ------------------
  |  Branch (2099:20): [True: 1.30k, False: 991]
  ------------------
 2100|  1.30k|          __m256i s_256[2];
 2101|       |
 2102|  1.30k|          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
 2103|       |
 2104|  15.4k|          do {
 2105|  15.4k|            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
 2106|  15.4k|                              &s_256[1], dst);
 2107|  15.4k|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
 2108|  15.4k|                              &s_256[0], dst + dst_stride);
 2109|  15.4k|            src_ptr += 2 * src_stride;
 2110|  15.4k|            dst += 2 * dst_stride;
 2111|  15.4k|            y -= 2;
 2112|  15.4k|          } while (y);
  ------------------
  |  Branch (2112:20): [True: 14.1k, False: 1.30k]
  ------------------
 2113|  1.30k|        } else if (w == 64) {
  ------------------
  |  Branch (2113:20): [True: 828, False: 163]
  ------------------
 2114|    828|          __m256i s_256[2][2];
 2115|       |
 2116|    828|          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
 2117|    828|          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
 2118|       |
 2119|  18.9k|          do {
 2120|  18.9k|            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
 2121|  18.9k|                              &s_256[1][0], dst);
 2122|  18.9k|            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
 2123|  18.9k|                              s_256[0][1], &s_256[1][1], dst + 32);
 2124|  18.9k|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
 2125|  18.9k|                              &s_256[0][0], dst + dst_stride);
 2126|  18.9k|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
 2127|  18.9k|                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
 2128|       |
 2129|  18.9k|            src_ptr += 2 * src_stride;
 2130|  18.9k|            dst += 2 * dst_stride;
 2131|  18.9k|            y -= 2;
 2132|  18.9k|          } while (y);
  ------------------
  |  Branch (2132:20): [True: 18.1k, False: 828]
  ------------------
 2133|    828|        } else {
 2134|    163|          __m256i s_256[2][4];
 2135|       |
 2136|    163|          assert(w == 128);
 2137|       |
 2138|    163|          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
 2139|    163|          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
 2140|    163|          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
 2141|    163|          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
 2142|       |
 2143|  7.80k|          do {
 2144|  7.80k|            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
 2145|  7.80k|                              &s_256[1][0], dst);
 2146|  7.80k|            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
 2147|  7.80k|                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
 2148|  7.80k|            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
 2149|  7.80k|                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
 2150|  7.80k|            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
 2151|  7.80k|                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
 2152|       |
 2153|  7.80k|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
 2154|  7.80k|                              &s_256[0][0], dst + dst_stride);
 2155|  7.80k|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
 2156|  7.80k|                              s_256[1][1], &s_256[0][1],
 2157|  7.80k|                              dst + dst_stride + 1 * 32);
 2158|  7.80k|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
 2159|  7.80k|                              s_256[1][2], &s_256[0][2],
 2160|  7.80k|                              dst + dst_stride + 2 * 32);
 2161|  7.80k|            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
 2162|  7.80k|                              s_256[1][3], &s_256[0][3],
 2163|  7.80k|                              dst + dst_stride + 3 * 32);
 2164|       |
 2165|  7.80k|            src_ptr += 2 * src_stride;
 2166|  7.80k|            dst += 2 * dst_stride;
 2167|  7.80k|            y -= 2;
 2168|  7.80k|          } while (y);
  ------------------
  |  Branch (2168:20): [True: 7.64k, False: 163]
  ------------------
 2169|    163|        }
 2170|  5.29k|      }
 2171|  22.8k|    } else {
 2172|       |      // average to get half pel
 2173|  14.4k|      if (w <= 8) {
  ------------------
  |  Branch (2173:11): [True: 10.7k, False: 3.66k]
  ------------------
 2174|  10.7k|        if (w == 2) {
  ------------------
  |  Branch (2174:13): [True: 1.31k, False: 9.47k]
  ------------------
 2175|  1.31k|          __m128i s_16[2];
 2176|       |
 2177|  1.31k|          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
 2178|       |
 2179|  2.80k|          do {
 2180|  2.80k|            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
 2181|  2.80k|            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
 2182|  2.80k|            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
 2183|  2.80k|            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
 2184|  2.80k|            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
 2185|  2.80k|            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
 2186|  2.80k|            src_ptr += 2 * src_stride;
 2187|  2.80k|            dst += 2 * dst_stride;
 2188|  2.80k|            y -= 2;
 2189|  2.80k|          } while (y);
  ------------------
  |  Branch (2189:20): [True: 1.49k, False: 1.31k]
  ------------------
 2190|  9.47k|        } else if (w == 4) {
  ------------------
  |  Branch (2190:20): [True: 5.18k, False: 4.29k]
  ------------------
 2191|  5.18k|          __m128i s_32[2];
 2192|       |
 2193|  5.18k|          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
 2194|       |
 2195|  16.5k|          do {
 2196|  16.5k|            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
 2197|  16.5k|            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
 2198|  16.5k|            xx_storel_32(dst, d0);
 2199|  16.5k|            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
 2200|  16.5k|            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
 2201|  16.5k|            xx_storel_32(dst + dst_stride, d1);
 2202|  16.5k|            src_ptr += 2 * src_stride;
 2203|  16.5k|            dst += 2 * dst_stride;
 2204|  16.5k|            y -= 2;
 2205|  16.5k|          } while (y);
  ------------------
  |  Branch (2205:20): [True: 11.3k, False: 5.18k]
  ------------------
 2206|  5.18k|        } else {
 2207|  4.29k|          __m128i s_64[2];
 2208|       |
 2209|  4.29k|          assert(w == 8);
 2210|       |
 2211|  4.29k|          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
 2212|       |
 2213|  17.4k|          do {
 2214|       |            // Note: Faster than binding to AVX2 registers.
 2215|  17.4k|            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
 2216|  17.4k|            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
 2217|  17.4k|            _mm_storel_epi64((__m128i *)dst, d0);
 2218|  17.4k|            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2219|  17.4k|            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
 2220|  17.4k|            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
 2221|  17.4k|            src_ptr += 2 * src_stride;
 2222|  17.4k|            dst += 2 * dst_stride;
 2223|  17.4k|            y -= 2;
 2224|  17.4k|          } while (y);
  ------------------
  |  Branch (2224:20): [True: 13.1k, False: 4.29k]
  ------------------
 2225|  4.29k|        }
 2226|  10.7k|      } else if (w == 16) {
  ------------------
  |  Branch (2226:18): [True: 2.25k, False: 1.40k]
  ------------------
 2227|  2.25k|        __m128i s_128[2];
 2228|       |
 2229|  2.25k|        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
 2230|       |
 2231|  13.8k|        do {
 2232|  13.8k|          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
 2233|  13.8k|          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
 2234|  13.8k|          _mm_storeu_si128((__m128i *)dst, d0);
 2235|  13.8k|          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
 2236|  13.8k|          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
 2237|  13.8k|          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
 2238|  13.8k|          src_ptr += 2 * src_stride;
 2239|  13.8k|          dst += 2 * dst_stride;
 2240|  13.8k|          y -= 2;
 2241|  13.8k|        } while (y);
  ------------------
  |  Branch (2241:18): [True: 11.5k, False: 2.25k]
  ------------------
 2242|  2.25k|      } else if (w == 32) {
  ------------------
  |  Branch (2242:18): [True: 884, False: 522]
  ------------------
 2243|    884|        __m256i s_256[2];
 2244|       |
 2245|    884|        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
 2246|       |
 2247|  10.6k|        do {
 2248|  10.6k|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
 2249|  10.6k|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
 2250|  10.6k|                                dst + dst_stride);
 2251|  10.6k|          src_ptr += 2 * src_stride;
 2252|  10.6k|          dst += 2 * dst_stride;
 2253|  10.6k|          y -= 2;
 2254|  10.6k|        } while (y);
  ------------------
  |  Branch (2254:18): [True: 9.77k, False: 884]
  ------------------
 2255|    884|      } else if (w == 64) {
  ------------------
  |  Branch (2255:18): [True: 344, False: 178]
  ------------------
 2256|    344|        __m256i s_256[2][2];
 2257|       |
 2258|    344|        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
 2259|    344|        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
 2260|       |
 2261|  8.46k|        do {
 2262|  8.46k|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
 2263|  8.46k|                                dst);
 2264|  8.46k|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
 2265|  8.46k|                                &s_256[1][1], dst + 32);
 2266|       |
 2267|  8.46k|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
 2268|  8.46k|                                &s_256[0][0], dst + dst_stride);
 2269|  8.46k|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
 2270|  8.46k|                                &s_256[0][1], dst + dst_stride + 32);
 2271|       |
 2272|  8.46k|          src_ptr += 2 * src_stride;
 2273|  8.46k|          dst += 2 * dst_stride;
 2274|  8.46k|          y -= 2;
 2275|  8.46k|        } while (y);
  ------------------
  |  Branch (2275:18): [True: 8.12k, False: 344]
  ------------------
 2276|    344|      } else {
 2277|    178|        __m256i s_256[2][4];
 2278|       |
 2279|    178|        assert(w == 128);
 2280|       |
 2281|    179|        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
 2282|    179|        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
 2283|    179|        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
 2284|    179|        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
 2285|       |
 2286|  8.73k|        do {
 2287|  8.73k|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
 2288|  8.73k|                                dst);
 2289|  8.73k|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
 2290|  8.73k|                                &s_256[1][1], dst + 1 * 32);
 2291|  8.73k|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
 2292|  8.73k|                                &s_256[1][2], dst + 2 * 32);
 2293|  8.73k|          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
 2294|  8.73k|                                &s_256[1][3], dst + 3 * 32);
 2295|       |
 2296|  8.73k|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
 2297|  8.73k|                                &s_256[0][0], dst + dst_stride);
 2298|  8.73k|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
 2299|  8.73k|                                &s_256[0][1], dst + dst_stride + 1 * 32);
 2300|  8.73k|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
 2301|  8.73k|                                &s_256[0][2], dst + dst_stride + 2 * 32);
 2302|  8.73k|          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
 2303|  8.73k|                                &s_256[0][3], dst + dst_stride + 3 * 32);
 2304|       |
 2305|  8.73k|          src_ptr += 2 * src_stride;
 2306|  8.73k|          dst += 2 * dst_stride;
 2307|  8.73k|          y -= 2;
 2308|  8.73k|        } while (y);
  ------------------
  |  Branch (2308:18): [True: 8.55k, False: 179]
  ------------------
 2309|    179|      }
 2310|  14.4k|    }
 2311|   710k|  } else if (vert_tap == 4) {
  ------------------
  |  Branch (2311:14): [True: 340k, False: 370k]
  ------------------
 2312|       |    // vert_filt as 4 tap
 2313|   340k|    const uint8_t *src_ptr = src - src_stride;
 2314|       |
 2315|   340k|    y = h;
 2316|       |
 2317|   340k|    if (w <= 4) {
  ------------------
  |  Branch (2317:9): [True: 170k, False: 169k]
  ------------------
 2318|   170k|      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
 2319|       |
 2320|   170k|      if (w == 2) {
  ------------------
  |  Branch (2320:11): [True: 32.3k, False: 138k]
  ------------------
 2321|  32.3k|        __m128i s_16[4], ss_128[2];
 2322|       |
 2323|  32.3k|        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
 2324|  32.3k|        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
 2325|  32.3k|        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
 2326|       |
 2327|  32.3k|        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
 2328|  32.3k|        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
 2329|       |
 2330|  32.3k|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2331|       |
 2332|  51.2k|        do {
 2333|  51.2k|          src_ptr += 2 * src_stride;
 2334|  51.2k|          const __m128i res = y_convolve_4tap_2x2_ssse3(
 2335|  51.2k|              src_ptr, src_stride, coeffs_128, s_16, ss_128);
 2336|  51.2k|          const __m128i r = sr_y_round_sse2(res);
 2337|  51.2k|          pack_store_2x2_sse2(r, dst, dst_stride);
 2338|       |
 2339|  51.2k|          ss_128[0] = ss_128[1];
 2340|  51.2k|          dst += 2 * dst_stride;
 2341|  51.2k|          y -= 2;
 2342|  51.2k|        } while (y);
  ------------------
  |  Branch (2342:18): [True: 18.9k, False: 32.3k]
  ------------------
 2343|   138k|      } else {
 2344|   138k|        __m128i s_32[4], ss_128[2];
 2345|       |
 2346|   138k|        assert(w == 4);
 2347|       |
 2348|   138k|        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
 2349|   138k|        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
 2350|   138k|        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
 2351|       |
 2352|   138k|        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 2353|   138k|        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
 2354|       |
 2355|   138k|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2356|       |
 2357|   281k|        do {
 2358|   281k|          src_ptr += 2 * src_stride;
 2359|   281k|          const __m128i res = y_convolve_4tap_4x2_ssse3(
 2360|   281k|              src_ptr, src_stride, coeffs_128, s_32, ss_128);
 2361|   281k|          const __m128i r = sr_y_round_sse2(res);
 2362|   281k|          pack_store_4x2_sse2(r, dst, dst_stride);
 2363|       |
 2364|   281k|          ss_128[0] = ss_128[1];
 2365|   281k|          dst += 2 * dst_stride;
 2366|   281k|          y -= 2;
 2367|   281k|        } while (y);
  ------------------
  |  Branch (2367:18): [True: 142k, False: 138k]
  ------------------
 2368|   138k|      }
 2369|   170k|    } else {
 2370|   169k|      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
 2371|       |
 2372|   169k|      if (w == 8) {
  ------------------
  |  Branch (2372:11): [True: 110k, False: 59.2k]
  ------------------
 2373|   110k|        __m128i s_64[4];
 2374|   110k|        __m256i ss_256[2];
 2375|       |
 2376|   110k|        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
 2377|   110k|        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
 2378|   110k|        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2379|       |
 2380|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2381|   110k|        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|   110k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   110k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2382|   110k|        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|   110k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   110k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2383|       |
 2384|   110k|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2385|       |
 2386|   228k|        do {
 2387|   228k|          src_ptr += 2 * src_stride;
 2388|   228k|          const __m256i res = y_convolve_4tap_8x2_avx2(
 2389|   228k|              src_ptr, src_stride, coeffs_256, s_64, ss_256);
 2390|   228k|          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
 2391|       |
 2392|   228k|          ss_256[0] = ss_256[1];
 2393|   228k|          dst += 2 * dst_stride;
 2394|   228k|          y -= 2;
 2395|   228k|        } while (y);
  ------------------
  |  Branch (2395:18): [True: 118k, False: 110k]
  ------------------
 2396|   110k|      } else if (w == 16) {
  ------------------
  |  Branch (2396:18): [True: 52.9k, False: 6.32k]
  ------------------
 2397|  52.9k|        __m128i s_128[4];
 2398|  52.9k|        __m256i ss_256[4], r[2];
 2399|       |
 2400|  52.9k|        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
 2401|  52.9k|        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
 2402|  52.9k|        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
 2403|       |
 2404|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2405|  52.9k|        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|  52.9k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  52.9k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2406|  52.9k|        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
  ------------------
  |  |   29|  52.9k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  52.9k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2407|       |
 2408|  52.9k|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2409|  52.9k|        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
 2410|       |
 2411|   140k|        do {
 2412|   140k|          src_ptr += 2 * src_stride;
 2413|   140k|          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
 2414|   140k|                                    ss_256, r);
 2415|   140k|          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
 2416|       |
 2417|   140k|          ss_256[0] = ss_256[1];
 2418|   140k|          ss_256[2] = ss_256[3];
 2419|   140k|          dst += 2 * dst_stride;
 2420|   140k|          y -= 2;
 2421|   140k|        } while (y);
  ------------------
  |  Branch (2421:18): [True: 87.8k, False: 52.9k]
  ------------------
 2422|  52.9k|      } else if (w == 32) {
  ------------------
  |  Branch (2422:18): [True: 4.84k, False: 1.47k]
  ------------------
 2423|       |        // AV1 standard won't have 32x4 case.
 2424|       |        // This only favors some optimization feature which
 2425|       |        // subsamples 32x8 to 32x4 and triggers 4-tap filter.
 2426|       |
 2427|  4.84k|        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
 2428|       |
 2429|  4.84k|        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
 2430|  4.84k|        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
 2431|  4.84k|        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
 2432|       |
 2433|  4.84k|        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 2434|  4.84k|        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 2435|       |
 2436|  4.84k|        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
 2437|  4.84k|        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
 2438|       |
 2439|  38.8k|        do {
 2440|  38.8k|          src_ptr += 2 * src_stride;
 2441|  38.8k|          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
 2442|  38.8k|                                    ss_256, tt_256, r);
 2443|  38.8k|          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
 2444|       |
 2445|  38.8k|          ss_256[0] = ss_256[1];
 2446|  38.8k|          ss_256[2] = ss_256[3];
 2447|       |
 2448|  38.8k|          tt_256[0] = tt_256[1];
 2449|  38.8k|          tt_256[2] = tt_256[3];
 2450|  38.8k|          dst += 2 * dst_stride;
 2451|  38.8k|          y -= 2;
 2452|  38.8k|        } while (y);
  ------------------
  |  Branch (2452:18): [True: 33.9k, False: 4.84k]
  ------------------
 2453|  4.84k|      } else {
 2454|  1.47k|        assert(!(w % 32));
 2455|       |
 2456|  1.48k|        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
 2457|  1.48k|        x = 0;
 2458|  3.54k|        do {
 2459|  3.54k|          const uint8_t *s = src_ptr + x;
 2460|  3.54k|          uint8_t *d = dst + x;
 2461|  3.54k|          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
 2462|  3.54k|          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
 2463|  3.54k|          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
 2464|       |
 2465|  3.54k|          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 2466|  3.54k|          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 2467|       |
 2468|  3.54k|          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
 2469|  3.54k|          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
 2470|       |
 2471|  3.54k|          y = h;
 2472|   134k|          do {
 2473|   134k|            s += 2 * src_stride;
 2474|   134k|            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
 2475|   134k|                                      tt_256, r);
 2476|   134k|            sr_y_round_store_32x2_avx2(r, d, dst_stride);
 2477|       |
 2478|   134k|            ss_256[0] = ss_256[1];
 2479|   134k|            ss_256[2] = ss_256[3];
 2480|       |
 2481|   134k|            tt_256[0] = tt_256[1];
 2482|   134k|            tt_256[2] = tt_256[3];
 2483|   134k|            d += 2 * dst_stride;
 2484|   134k|            y -= 2;
 2485|   134k|          } while (y);
  ------------------
  |  Branch (2485:20): [True: 131k, False: 3.54k]
  ------------------
 2486|  3.54k|          x += 32;
 2487|  3.54k|        } while (x < w);
  ------------------
  |  Branch (2487:18): [True: 2.06k, False: 1.48k]
  ------------------
 2488|  1.48k|      }
 2489|   169k|    }
 2490|   370k|  } else if (vert_tap == 6) {
  ------------------
  |  Branch (2490:14): [True: 346k, False: 23.2k]
  ------------------
 2491|       |    // vert_filt as 6 tap
 2492|   346k|    const uint8_t *src_ptr = src - 2 * src_stride;
 2493|       |
 2494|   346k|    if (w <= 4) {
  ------------------
  |  Branch (2494:9): [True: 109k, False: 237k]
  ------------------
 2495|   109k|      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
 2496|       |
 2497|   109k|      y = h;
 2498|       |
 2499|   109k|      if (w == 2) {
  ------------------
  |  Branch (2499:11): [True: 18.6k, False: 90.9k]
  ------------------
 2500|  18.6k|        __m128i s_16[6], ss_128[3];
 2501|       |
 2502|  18.6k|        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
 2503|  18.6k|        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
 2504|  18.6k|        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
 2505|  18.6k|        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
 2506|  18.6k|        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
 2507|       |
 2508|  18.6k|        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
 2509|  18.6k|        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
 2510|  18.6k|        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
 2511|  18.6k|        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
 2512|       |
 2513|  18.6k|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2514|  18.6k|        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 2515|       |
 2516|  74.4k|        do {
 2517|  74.4k|          src_ptr += 2 * src_stride;
 2518|  74.4k|          const __m128i res = y_convolve_6tap_2x2_ssse3(
 2519|  74.4k|              src_ptr, src_stride, coeffs_128, s_16, ss_128);
 2520|  74.4k|          const __m128i r = sr_y_round_sse2(res);
 2521|  74.4k|          pack_store_2x2_sse2(r, dst, dst_stride);
 2522|       |
 2523|  74.4k|          ss_128[0] = ss_128[1];
 2524|  74.4k|          ss_128[1] = ss_128[2];
 2525|  74.4k|          dst += 2 * dst_stride;
 2526|  74.4k|          y -= 2;
 2527|  74.4k|        } while (y);
  ------------------
  |  Branch (2527:18): [True: 55.8k, False: 18.6k]
  ------------------
 2528|  90.9k|      } else {
 2529|  90.9k|        __m128i s_32[6], ss_128[3];
 2530|       |
 2531|  90.9k|        assert(w == 4);
 2532|       |
 2533|  90.9k|        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
 2534|  90.9k|        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
 2535|  90.9k|        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
 2536|  90.9k|        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
 2537|  90.9k|        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
 2538|       |
 2539|  90.9k|        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 2540|  90.9k|        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
 2541|  90.9k|        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
 2542|  90.9k|        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
 2543|       |
 2544|  90.9k|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2545|  90.9k|        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 2546|       |
 2547|   493k|        do {
 2548|   493k|          src_ptr += 2 * src_stride;
 2549|   493k|          const __m128i res = y_convolve_6tap_4x2_ssse3(
 2550|   493k|              src_ptr, src_stride, coeffs_128, s_32, ss_128);
 2551|   493k|          const __m128i r = sr_y_round_sse2(res);
 2552|   493k|          pack_store_4x2_sse2(r, dst, dst_stride);
 2553|       |
 2554|   493k|          ss_128[0] = ss_128[1];
 2555|   493k|          ss_128[1] = ss_128[2];
 2556|   493k|          dst += 2 * dst_stride;
 2557|   493k|          y -= 2;
 2558|   493k|        } while (y);
  ------------------
  |  Branch (2558:18): [True: 402k, False: 90.9k]
  ------------------
 2559|  90.9k|      }
 2560|   237k|    } else {
 2561|   237k|      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
 2562|       |
 2563|   237k|      if (w == 8) {
  ------------------
  |  Branch (2563:11): [True: 120k, False: 117k]
  ------------------
 2564|   120k|        __m128i s_64[6];
 2565|   120k|        __m256i ss_256[3];
 2566|       |
 2567|   120k|        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
 2568|   120k|        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
 2569|   120k|        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2570|   120k|        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
 2571|   120k|        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
 2572|       |
 2573|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2574|   120k|        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|   120k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   120k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2575|   120k|        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|   120k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   120k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2576|   120k|        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|   120k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   120k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2577|   120k|        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
  ------------------
  |  |   29|   120k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   120k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2578|       |
 2579|   120k|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2580|   120k|        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 2581|       |
 2582|   120k|        y = h;
 2583|   690k|        do {
 2584|   690k|          src_ptr += 2 * src_stride;
 2585|   690k|          const __m256i res = y_convolve_6tap_8x2_avx2(
 2586|   690k|              src_ptr, src_stride, coeffs_256, s_64, ss_256);
 2587|   690k|          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
 2588|       |
 2589|   690k|          ss_256[0] = ss_256[1];
 2590|   690k|          ss_256[1] = ss_256[2];
 2591|   690k|          dst += 2 * dst_stride;
 2592|   690k|          y -= 2;
 2593|   690k|        } while (y);
  ------------------
  |  Branch (2593:18): [True: 569k, False: 120k]
  ------------------
 2594|   120k|      } else if (w == 16) {
  ------------------
  |  Branch (2594:18): [True: 81.7k, False: 35.4k]
  ------------------
 2595|  81.7k|        __m128i s_128[6];
 2596|  81.7k|        __m256i ss_256[6], r[2];
 2597|       |
 2598|  81.7k|        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
 2599|  81.7k|        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
 2600|  81.7k|        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
 2601|  81.7k|        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
 2602|  81.7k|        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
 2603|       |
 2604|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2605|  81.7k|        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|  81.7k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  81.7k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2606|  81.7k|        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
  ------------------
  |  |   29|  81.7k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  81.7k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2607|  81.7k|        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
  ------------------
  |  |   29|  81.7k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  81.7k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2608|  81.7k|        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
  ------------------
  |  |   29|  81.7k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  81.7k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2609|       |
 2610|  81.7k|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2611|  81.7k|        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 2612|       |
 2613|  81.7k|        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
 2614|  81.7k|        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
 2615|       |
 2616|  81.7k|        y = h;
 2617|   602k|        do {
 2618|   602k|          src_ptr += 2 * src_stride;
 2619|   602k|          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
 2620|   602k|                                    ss_256, r);
 2621|   602k|          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
 2622|       |
 2623|   602k|          ss_256[0] = ss_256[1];
 2624|   602k|          ss_256[1] = ss_256[2];
 2625|       |
 2626|   602k|          ss_256[3] = ss_256[4];
 2627|   602k|          ss_256[4] = ss_256[5];
 2628|   602k|          dst += 2 * dst_stride;
 2629|   602k|          y -= 2;
 2630|   602k|        } while (y);
  ------------------
  |  Branch (2630:18): [True: 521k, False: 81.7k]
  ------------------
 2631|  81.7k|      } else {
 2632|  35.4k|        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
 2633|       |
 2634|  35.4k|        assert(!(w % 32));
 2635|       |
 2636|  35.4k|        x = 0;
 2637|  42.0k|        do {
 2638|  42.0k|          const uint8_t *s = src_ptr + x;
 2639|  42.0k|          uint8_t *d = dst + x;
 2640|       |
 2641|  42.0k|          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
 2642|  42.0k|          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
 2643|  42.0k|          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
 2644|  42.0k|          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
 2645|  42.0k|          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
 2646|       |
 2647|  42.0k|          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 2648|  42.0k|          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
 2649|  42.0k|          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 2650|  42.0k|          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
 2651|       |
 2652|  42.0k|          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
 2653|  42.0k|          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
 2654|  42.0k|          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
 2655|  42.0k|          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
 2656|       |
 2657|  42.0k|          y = h;
 2658|   714k|          do {
 2659|   714k|            s += 2 * src_stride;
 2660|   714k|            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
 2661|   714k|                                      tt_256, r);
 2662|   714k|            sr_y_round_store_32x2_avx2(r, d, dst_stride);
 2663|       |
 2664|   714k|            ss_256[0] = ss_256[1];
 2665|   714k|            ss_256[1] = ss_256[2];
 2666|   714k|            ss_256[3] = ss_256[4];
 2667|   714k|            ss_256[4] = ss_256[5];
 2668|       |
 2669|   714k|            tt_256[0] = tt_256[1];
 2670|   714k|            tt_256[1] = tt_256[2];
 2671|   714k|            tt_256[3] = tt_256[4];
 2672|   714k|            tt_256[4] = tt_256[5];
 2673|   714k|            d += 2 * dst_stride;
 2674|   714k|            y -= 2;
 2675|   714k|          } while (y);
  ------------------
  |  Branch (2675:20): [True: 672k, False: 42.0k]
  ------------------
 2676|       |
 2677|  42.0k|          x += 32;
 2678|  42.0k|        } while (x < w);
  ------------------
  |  Branch (2678:18): [True: 6.58k, False: 35.4k]
  ------------------
 2679|  35.4k|      }
 2680|   237k|    }
 2681|   346k|  } else if (vert_tap == 8) {
  ------------------
  |  Branch (2681:14): [True: 23.2k, False: 18.4E]
  ------------------
 2682|       |    // vert_filt as 8 tap
 2683|  23.2k|    const uint8_t *src_ptr = src - 3 * src_stride;
 2684|       |
 2685|  23.2k|    if (w <= 4) {
  ------------------
  |  Branch (2685:9): [True: 7.32k, False: 15.9k]
  ------------------
 2686|  7.32k|      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
 2687|       |
 2688|  7.32k|      y = h;
 2689|       |
 2690|  7.32k|      if (w == 2) {
  ------------------
  |  Branch (2690:11): [True: 1.37k, False: 5.94k]
  ------------------
 2691|  1.37k|        __m128i s_16[8], ss_128[4];
 2692|       |
 2693|  1.37k|        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
 2694|  1.37k|        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
 2695|  1.37k|        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
 2696|  1.37k|        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
 2697|  1.37k|        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
 2698|  1.37k|        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
 2699|  1.37k|        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
 2700|       |
 2701|  1.37k|        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
 2702|  1.37k|        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
 2703|  1.37k|        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
 2704|  1.37k|        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
 2705|  1.37k|        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
 2706|  1.37k|        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
 2707|       |
 2708|  1.37k|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2709|  1.37k|        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 2710|  1.37k|        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
 2711|       |
 2712|  5.50k|        do {
 2713|  5.50k|          const __m128i res = y_convolve_8tap_2x2_ssse3(
 2714|  5.50k|              src_ptr, src_stride, coeffs_128, s_16, ss_128);
 2715|  5.50k|          const __m128i r = sr_y_round_sse2(res);
 2716|  5.50k|          pack_store_2x2_sse2(r, dst, dst_stride);
 2717|  5.50k|          ss_128[0] = ss_128[1];
 2718|  5.50k|          ss_128[1] = ss_128[2];
 2719|  5.50k|          ss_128[2] = ss_128[3];
 2720|  5.50k|          src_ptr += 2 * src_stride;
 2721|  5.50k|          dst += 2 * dst_stride;
 2722|  5.50k|          y -= 2;
 2723|  5.50k|        } while (y);
  ------------------
  |  Branch (2723:18): [True: 4.12k, False: 1.37k]
  ------------------
 2724|  5.94k|      } else {
 2725|  5.94k|        __m128i s_32[8], ss_128[4];
 2726|       |
 2727|  5.94k|        assert(w == 4);
 2728|       |
 2729|  5.94k|        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
 2730|  5.94k|        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
 2731|  5.94k|        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
 2732|  5.94k|        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
 2733|  5.94k|        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
 2734|  5.94k|        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
 2735|  5.94k|        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
 2736|       |
 2737|  5.94k|        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 2738|  5.94k|        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
 2739|  5.94k|        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
 2740|  5.94k|        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
 2741|  5.94k|        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
 2742|  5.94k|        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
 2743|       |
 2744|  5.94k|        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
 2745|  5.94k|        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 2746|  5.94k|        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
 2747|       |
 2748|  32.7k|        do {
 2749|  32.7k|          const __m128i res = y_convolve_8tap_4x2_ssse3(
 2750|  32.7k|              src_ptr, src_stride, coeffs_128, s_32, ss_128);
 2751|  32.7k|          const __m128i r = sr_y_round_sse2(res);
 2752|  32.7k|          pack_store_4x2_sse2(r, dst, dst_stride);
 2753|  32.7k|          ss_128[0] = ss_128[1];
 2754|  32.7k|          ss_128[1] = ss_128[2];
 2755|  32.7k|          ss_128[2] = ss_128[3];
 2756|  32.7k|          src_ptr += 2 * src_stride;
 2757|  32.7k|          dst += 2 * dst_stride;
 2758|  32.7k|          y -= 2;
 2759|  32.7k|        } while (y);
  ------------------
  |  Branch (2759:18): [True: 26.8k, False: 5.94k]
  ------------------
 2760|  5.94k|      }
 2761|  15.9k|    } else {
 2762|  15.9k|      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
 2763|       |
 2764|  15.9k|      if (w == 8) {
  ------------------
  |  Branch (2764:11): [True: 7.72k, False: 8.18k]
  ------------------
 2765|  7.72k|        __m128i s_64[8];
 2766|  7.72k|        __m256i ss_256[4];
 2767|       |
 2768|  7.72k|        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
 2769|  7.72k|        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
 2770|  7.72k|        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
 2771|  7.72k|        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
 2772|  7.72k|        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
 2773|  7.72k|        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
 2774|  7.72k|        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
 2775|       |
 2776|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2777|  7.72k|        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
  ------------------
  |  |   29|  7.72k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  7.72k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2778|  7.72k|        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
  ------------------
  |  |   29|  7.72k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  7.72k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2779|  7.72k|        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|  7.72k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  7.72k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2780|  7.72k|        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
  ------------------
  |  |   29|  7.72k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  7.72k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2781|  7.72k|        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
  ------------------
  |  |   29|  7.72k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  7.72k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2782|  7.72k|        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
  ------------------
  |  |   29|  7.72k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  7.72k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2783|       |
 2784|  7.72k|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2785|  7.72k|        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 2786|  7.72k|        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
 2787|       |
 2788|  7.72k|        y = h;
 2789|  47.0k|        do {
 2790|  47.0k|          const __m256i res = y_convolve_8tap_8x2_avx2(
 2791|  47.0k|              src_ptr, src_stride, coeffs_256, s_64, ss_256);
 2792|  47.0k|          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
 2793|  47.0k|          ss_256[0] = ss_256[1];
 2794|  47.0k|          ss_256[1] = ss_256[2];
 2795|  47.0k|          ss_256[2] = ss_256[3];
 2796|  47.0k|          src_ptr += 2 * src_stride;
 2797|  47.0k|          dst += 2 * dst_stride;
 2798|  47.0k|          y -= 2;
 2799|  47.0k|        } while (y);
  ------------------
  |  Branch (2799:18): [True: 39.3k, False: 7.72k]
  ------------------
 2800|  8.18k|      } else if (w == 16) {
  ------------------
  |  Branch (2800:18): [True: 4.94k, False: 3.24k]
  ------------------
 2801|  4.94k|        __m128i s_128[8];
 2802|  4.94k|        __m256i ss_256[8], r[2];
 2803|       |
 2804|  4.94k|        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
 2805|  4.94k|        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
 2806|  4.94k|        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
 2807|  4.94k|        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
 2808|  4.94k|        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
 2809|  4.94k|        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
 2810|  4.94k|        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
 2811|       |
 2812|       |        // Load lines a and b. Line a to lower 128, line b to upper 128
 2813|  4.94k|        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|  4.94k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  4.94k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2814|  4.94k|        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
  ------------------
  |  |   29|  4.94k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  4.94k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2815|  4.94k|        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
  ------------------
  |  |   29|  4.94k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  4.94k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2816|  4.94k|        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
  ------------------
  |  |   29|  4.94k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  4.94k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2817|  4.94k|        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
  ------------------
  |  |   29|  4.94k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  4.94k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2818|  4.94k|        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
  ------------------
  |  |   29|  4.94k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  4.94k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 2819|       |
 2820|  4.94k|        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
 2821|  4.94k|        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 2822|  4.94k|        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
 2823|       |
 2824|  4.94k|        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
 2825|  4.94k|        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
 2826|  4.94k|        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
 2827|       |
 2828|  4.94k|        y = h;
 2829|  41.2k|        do {
 2830|  41.2k|          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
 2831|  41.2k|                                    ss_256, r);
 2832|  41.2k|          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
 2833|       |
 2834|  41.2k|          ss_256[0] = ss_256[1];
 2835|  41.2k|          ss_256[1] = ss_256[2];
 2836|  41.2k|          ss_256[2] = ss_256[3];
 2837|       |
 2838|  41.2k|          ss_256[4] = ss_256[5];
 2839|  41.2k|          ss_256[5] = ss_256[6];
 2840|  41.2k|          ss_256[6] = ss_256[7];
 2841|  41.2k|          src_ptr += 2 * src_stride;
 2842|  41.2k|          dst += 2 * dst_stride;
 2843|  41.2k|          y -= 2;
 2844|  41.2k|        } while (y);
  ------------------
  |  Branch (2844:18): [True: 36.2k, False: 4.94k]
  ------------------
 2845|  4.94k|      } else {
 2846|  3.24k|        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
 2847|       |
 2848|  3.24k|        assert(!(w % 32));
 2849|       |
 2850|  3.24k|        x = 0;
 2851|  4.52k|        do {
 2852|  4.52k|          const uint8_t *s = src_ptr + x;
 2853|  4.52k|          uint8_t *d = dst + x;
 2854|       |
 2855|  4.52k|          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
 2856|  4.52k|          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
 2857|  4.52k|          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
 2858|  4.52k|          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
 2859|  4.52k|          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
 2860|  4.52k|          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
 2861|  4.52k|          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
 2862|       |
 2863|  4.52k|          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 2864|  4.52k|          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
 2865|  4.52k|          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
 2866|  4.52k|          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 2867|  4.52k|          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
 2868|  4.52k|          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
 2869|       |
 2870|  4.52k|          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
 2871|  4.52k|          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
 2872|  4.52k|          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
 2873|  4.52k|          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
 2874|  4.52k|          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
 2875|  4.52k|          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
 2876|       |
 2877|  4.52k|          y = h;
 2878|   107k|          do {
 2879|   107k|            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
 2880|   107k|                                      tt_256, r);
 2881|   107k|            sr_y_round_store_32x2_avx2(r, d, dst_stride);
 2882|       |
 2883|   107k|            ss_256[0] = ss_256[1];
 2884|   107k|            ss_256[1] = ss_256[2];
 2885|   107k|            ss_256[2] = ss_256[3];
 2886|   107k|            ss_256[4] = ss_256[5];
 2887|   107k|            ss_256[5] = ss_256[6];
 2888|   107k|            ss_256[6] = ss_256[7];
 2889|       |
 2890|   107k|            tt_256[0] = tt_256[1];
 2891|   107k|            tt_256[1] = tt_256[2];
 2892|   107k|            tt_256[2] = tt_256[3];
 2893|   107k|            tt_256[4] = tt_256[5];
 2894|   107k|            tt_256[5] = tt_256[6];
 2895|   107k|            tt_256[6] = tt_256[7];
 2896|   107k|            s += 2 * src_stride;
 2897|   107k|            d += 2 * dst_stride;
 2898|   107k|            y -= 2;
 2899|   107k|          } while (y);
  ------------------
  |  Branch (2899:20): [True: 103k, False: 4.52k]
  ------------------
 2900|       |
 2901|  4.52k|          x += 32;
 2902|  4.52k|        } while (x < w);
  ------------------
  |  Branch (2902:18): [True: 1.28k, False: 3.24k]
  ------------------
 2903|  3.24k|      }
 2904|  15.9k|    }
 2905|  23.2k|  }
 2906|   747k|}
convolve_avx2.c:prepare_half_coeffs_2tap_ssse3:
   61|  38.6k|    __m128i *const coeffs /* [1] */) {
   62|  38.6k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   63|  38.6k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  38.6k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  38.6k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   64|  38.6k|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
   65|       |
   66|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
   67|       |  // This extra right shift will be taken care of at the end while rounding
   68|       |  // the result.
   69|       |  // Since all filter co-efficients are even, this change will not affect the
   70|       |  // end result
   71|  38.6k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   72|  38.6k|                            _mm_set1_epi16((short)0xffff)));
   73|       |
   74|  38.6k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   75|       |
   76|       |  // coeffs 3 4 3 4 3 4 3 4
   77|  38.6k|  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
   78|  38.6k|}
convolve_avx2.c:y_convolve_2tap_2x2_ssse3:
 1087|  5.59k|                                                __m128i s_16[2]) {
 1088|  5.59k|  __m128i s_128[2];
 1089|       |
 1090|  5.59k|  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
 1091|  5.59k|  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
 1092|  5.59k|  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
 1093|  5.59k|  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
 1094|  5.59k|  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
 1095|  5.59k|  return convolve_2tap_ssse3(&ss, coeffs);
 1096|  5.59k|}
convolve_avx2.c:sr_y_round_sse2:
  792|  1.01M|static inline __m128i sr_y_round_sse2(const __m128i src) {
  793|  1.01M|  const __m128i round = _mm_set1_epi16(32);
  794|  1.01M|  const __m128i dst = _mm_add_epi16(src, round);
  795|  1.01M|  return _mm_srai_epi16(dst, FILTER_BITS - 1);
  ------------------
  |  |   21|  1.01M|#define FILTER_BITS 7
  ------------------
  796|  1.01M|}
convolve_avx2.c:pack_store_2x2_sse2:
  687|   316k|                                       const ptrdiff_t stride) {
  688|   316k|  const __m128i d = _mm_packus_epi16(res, res);
  689|   316k|  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
  690|   316k|  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
  691|   316k|}
convolve_avx2.c:y_convolve_2tap_4x2_ssse3:
 1101|  26.0k|                                                __m128i s_32[2]) {
 1102|  26.0k|  __m128i s_128[2];
 1103|       |
 1104|  26.0k|  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
 1105|  26.0k|  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
 1106|  26.0k|  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
 1107|  26.0k|  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
 1108|  26.0k|  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
 1109|  26.0k|  return convolve_2tap_ssse3(&ss, coeffs);
 1110|  26.0k|}
convolve_avx2.c:pack_store_4x2_sse2:
  694|  1.75M|                                       const ptrdiff_t stride) {
  695|  1.75M|  const __m128i d = _mm_packus_epi16(res, res);
  696|  1.75M|  store_u8_4x2_sse2(d, dst, stride);
  697|  1.75M|}
convolve_avx2.c:convolve_2tap_ssse3:
  433|   171k|                                          const __m128i coeffs[1]) {
  434|   171k|  return _mm_maddubs_epi16(ss[0], coeffs[0]);
  435|   171k|}
convolve_avx2.c:prepare_half_coeffs_2tap_avx2:
  157|  12.4k|    __m256i *const coeffs /* [1] */) {
  158|  12.4k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  159|  12.4k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  12.4k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  12.4k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  160|  12.4k|  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  161|  12.4k|  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
  162|       |
  163|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  164|       |  // This extra right shift will be taken care of at the end while rounding
  165|       |  // the result.
  166|       |  // Since all filter co-efficients are even, this change will not affect the
  167|       |  // end result
  168|  12.4k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  169|  12.4k|                            _mm_set1_epi16((short)0xffff)));
  170|       |
  171|  12.4k|  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
  172|       |
  173|       |  // coeffs 3 4 3 4 3 4 3 4
  174|  12.4k|  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  175|  12.4k|}
convolve_avx2.c:y_convolve_2tap_16x2_avx2:
 1129|  18.8k|                                             __m128i s_128[2], __m256i r[2]) {
 1130|  18.8k|  __m256i s_256[2];
 1131|       |
 1132|  18.8k|  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
 1133|  18.8k|  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
  ------------------
  |  |   29|  18.8k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  18.8k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1134|  18.8k|  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
 1135|  18.8k|  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
  ------------------
  |  |   29|  18.8k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  18.8k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1136|  18.8k|  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
 1137|  18.8k|  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
 1138|  18.8k|  r[0] = convolve_2tap_avx2(&ss0, coeffs);
 1139|  18.8k|  r[1] = convolve_2tap_avx2(&ss1, coeffs);
 1140|  18.8k|}
convolve_avx2.c:convolve_2tap_avx2:
  465|   890k|                                         const __m256i coeffs[1]) {
  466|   890k|  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
  467|   890k|}
convolve_avx2.c:sr_y_round_store_16x2_avx2:
  833|   803k|                                              const ptrdiff_t dst_stride) {
  834|   803k|  __m256i r[2];
  835|       |
  836|   803k|  r[0] = sr_y_round_avx2(res[0]);
  837|   803k|  r[1] = sr_y_round_avx2(res[1]);
  838|   803k|  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
  839|   803k|}
convolve_avx2.c:sr_y_round_avx2:
  596|  6.89M|static inline __m256i sr_y_round_avx2(const __m256i src) {
  597|  6.89M|  const __m256i round = _mm256_set1_epi16(32);
  598|  6.89M|  const __m256i dst = _mm256_add_epi16(src, round);
  599|  6.89M|  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
  ------------------
  |  |   21|  6.89M|#define FILTER_BITS 7
  ------------------
  600|  6.89M|}
convolve_avx2.c:pack_store_16x2_avx2:
  720|  1.64M|                                        const ptrdiff_t stride) {
  721|  1.64M|  const __m256i d = _mm256_packus_epi16(res0, res1);
  722|  1.64M|  storeu_u8_16x2_avx2(d, dst, stride);
  723|  1.64M|}
convolve_avx2.c:sr_y_2tap_32_avx2:
 1999|   169k|                                     __m256i *const s1, uint8_t *const dst) {
 2000|   169k|  __m256i r[2];
 2001|   169k|  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
 2002|   169k|  sr_y_round_store_32_avx2(r, dst);
 2003|   169k|}
convolve_avx2.c:y_convolve_2tap_32_avx2:
 1145|   169k|                                           __m256i r[2]) {
 1146|   169k|  *s1 = _mm256_loadu_si256((__m256i *)src);
 1147|   169k|  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
 1148|   169k|  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
 1149|   169k|  r[0] = convolve_2tap_avx2(&ss0, coeffs);
 1150|   169k|  r[1] = convolve_2tap_avx2(&ss1, coeffs);
 1151|   169k|}
convolve_avx2.c:sr_y_round_store_32_avx2:
 1982|  2.15M|                                            uint8_t *const dst) {
 1983|  2.15M|  __m256i r[2];
 1984|       |
 1985|  2.15M|  r[0] = sr_y_round_avx2(res[0]);
 1986|  2.15M|  r[1] = sr_y_round_avx2(res[1]);
 1987|  2.15M|  convolve_store_32_avx2(r[0], r[1], dst);
 1988|  2.15M|}
convolve_avx2.c:convolve_store_32_avx2:
  775|  4.34M|                                          uint8_t *const dst) {
  776|  4.34M|  const __m256i d = _mm256_packus_epi16(res0, res1);
  777|  4.34M|  _mm256_storeu_si256((__m256i *)dst, d);
  778|  4.34M|}
convolve_avx2.c:sr_y_2tap_32_avg_avx2:
  843|   125k|                                         uint8_t *const dst) {
  844|   125k|  *s1 = _mm256_loadu_si256((__m256i *)src);
  845|   125k|  const __m256i d = _mm256_avg_epu8(s0, *s1);
  846|   125k|  _mm256_storeu_si256((__m256i *)dst, d);
  847|   125k|}
convolve_avx2.c:prepare_half_coeffs_4tap_ssse3:
   82|   510k|    __m128i *const coeffs /* [2] */) {
   83|   510k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
   84|   510k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   510k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   510k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
   85|   510k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
   86|       |
   87|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
   88|       |  // This extra right shift will be taken care of at the end while rounding
   89|       |  // the result.
   90|       |  // Since all filter co-efficients are even, this change will not affect the
   91|       |  // end result
   92|   510k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
   93|   510k|                            _mm_set1_epi16((short)0xffff)));
   94|       |
   95|   510k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
   96|       |
   97|       |  // coeffs 2 3 2 3 2 3 2 3
   98|   510k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
   99|       |  // coeffs 4 5 4 5 4 5 4 5
  100|   510k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  101|   510k|}
convolve_avx2.c:y_convolve_4tap_2x2_ssse3:
 1157|  51.2k|                                                __m128i ss_128[2]) {
 1158|  51.2k|  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
 1159|  51.2k|  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
 1160|  51.2k|  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
 1161|  51.2k|  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
 1162|  51.2k|  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 1163|  51.2k|  return convolve_4tap_ssse3(ss_128, coeffs);
 1164|  51.2k|}
convolve_avx2.c:convolve_4tap_ssse3:
  438|  1.39M|                                          const __m128i coeffs[2]) {
  439|  1.39M|  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  440|  1.39M|  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  441|  1.39M|  return _mm_add_epi16(res_23, res_45);
  442|  1.39M|}
convolve_avx2.c:y_convolve_4tap_4x2_ssse3:
 1170|   281k|                                                __m128i ss_128[2]) {
 1171|   281k|  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
 1172|   281k|  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
 1173|   281k|  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
 1174|   281k|  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
 1175|   281k|  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
 1176|   281k|  return convolve_4tap_ssse3(ss_128, coeffs);
 1177|   281k|}
convolve_avx2.c:prepare_half_coeffs_4tap_avx2:
  179|   169k|    __m256i *const coeffs /* [2] */) {
  180|   169k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  181|   169k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   169k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   169k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  182|   169k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  183|       |
  184|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  185|       |  // This extra right shift will be taken care of at the end while rounding
  186|       |  // the result.
  187|       |  // Since all filter co-efficients are even, this change will not affect the
  188|       |  // end result
  189|   169k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  190|   169k|                            _mm_set1_epi16((short)0xffff)));
  191|   169k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  192|   169k|  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
  193|   169k|}
convolve_avx2.c:populate_coeffs_4tap_avx2:
   24|   169k|                                             __m256i coeffs[2]) {
   25|   169k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   26|       |
   27|       |  // coeffs 2 3 2 3 2 3 2 3
   28|   169k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   29|       |  // coeffs 4 5 4 5 4 5 4 5
   30|   169k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   31|   169k|}
convolve_avx2.c:y_convolve_4tap_8x2_avx2:
 1183|   228k|                                               __m256i ss_256[2]) {
 1184|   228k|  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
 1185|   228k|  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
  ------------------
  |  |   29|   228k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   228k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1186|   228k|  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
 1187|   228k|  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
  ------------------
  |  |   29|   228k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   228k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1188|   228k|  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 1189|   228k|  return convolve_4tap_avx2(ss_256, coeffs);
 1190|   228k|}
convolve_avx2.c:convolve_4tap_avx2:
  470|  1.20M|                                         const __m256i coeffs[2]) {
  471|  1.20M|  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  472|  1.20M|  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  473|  1.20M|  return _mm256_add_epi16(res_23, res_45);
  474|  1.20M|}
convolve_avx2.c:sr_y_round_store_8x2_avx2:
  826|   965k|                                             const ptrdiff_t dst_stride) {
  827|   965k|  const __m256i r = sr_y_round_avx2(res);
  828|   965k|  pack_store_8x2_avx2(r, dst, dst_stride);
  829|   965k|}
convolve_avx2.c:pack_store_8x2_avx2:
  710|  1.96M|                                       const ptrdiff_t stride) {
  711|  1.96M|  const __m256i d = _mm256_packus_epi16(res, res);
  712|  1.96M|  const __m128i d0 = _mm256_castsi256_si128(d);
  713|  1.96M|  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  714|  1.96M|  _mm_storel_epi64((__m128i *)dst, d0);
  715|  1.96M|  _mm_storel_epi64((__m128i *)(dst + stride), d1);
  716|  1.96M|}
convolve_avx2.c:y_convolve_4tap_16x2_avx2:
 1196|   140k|                                             __m256i ss_256[4], __m256i r[2]) {
 1197|   140k|  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
 1198|   140k|  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
  ------------------
  |  |   29|   140k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   140k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1199|   140k|  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
 1200|   140k|  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
  ------------------
  |  |   29|   140k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   140k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1201|   140k|  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
 1202|   140k|  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
 1203|   140k|  r[0] = convolve_4tap_avx2(ss_256, coeffs);
 1204|   140k|  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
 1205|   140k|}
convolve_avx2.c:y_convolve_4tap_32x2_avx2:
 1222|   173k|    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
 1223|   173k|  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
 1224|   173k|  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
 1225|   173k|  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
 1226|   173k|  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
 1227|   173k|  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
 1228|   173k|  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
 1229|   173k|  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
 1230|   173k|  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
 1231|   173k|  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
 1232|   173k|  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
 1233|   173k|}
convolve_avx2.c:sr_y_round_store_32x2_avx2:
 1992|   995k|                                              const int32_t dst_stride) {
 1993|   995k|  sr_y_round_store_32_avx2(res, dst);
 1994|   995k|  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
 1995|   995k|}
convolve_avx2.c:prepare_half_coeffs_6tap_ssse3:
  105|   109k|    __m128i *const coeffs /* [3] */) {
  106|   109k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  107|   109k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   109k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   109k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  108|   109k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  109|       |
  110|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  111|       |  // This extra right shift will be taken care of at the end while rounding
  112|       |  // the result.
  113|       |  // Since all filter co-efficients are even, this change will not affect the
  114|       |  // end result
  115|   109k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  116|   109k|                            _mm_set1_epi16((short)0xffff)));
  117|       |
  118|   109k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  119|       |
  120|       |  // coeffs 1 2 1 2 1 2 1 2
  121|   109k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
  122|       |  // coeffs 3 4 3 4 3 4 3 4
  123|   109k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
  124|       |  // coeffs 5 6 5 6 5 6 5 6
  125|   109k|  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
  126|   109k|}
convolve_avx2.c:y_convolve_6tap_2x2_ssse3:
 1211|  74.4k|                                                __m128i ss_128[3]) {
 1212|  74.4k|  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
 1213|  74.4k|  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
 1214|  74.4k|  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
 1215|  74.4k|  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
 1216|  74.4k|  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
 1217|  74.4k|  return convolve_6tap_ssse3(ss_128, coeffs);
 1218|  74.4k|}
convolve_avx2.c:convolve_6tap_ssse3:
  445|   567k|                                          const __m128i coeffs[3]) {
  446|   567k|  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  447|   567k|  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  448|   567k|  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  449|   567k|  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
  450|   567k|  return _mm_add_epi16(res_1256, res_34);
  451|   567k|}
convolve_avx2.c:y_convolve_6tap_4x2_ssse3:
 1239|   493k|                                                __m128i ss_128[3]) {
 1240|   493k|  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
 1241|   493k|  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
 1242|   493k|  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
 1243|   493k|  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
 1244|   493k|  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
 1245|   493k|  return convolve_6tap_ssse3(ss_128, coeffs);
 1246|   493k|}
convolve_avx2.c:prepare_half_coeffs_6tap_avx2:
  197|   664k|    __m256i *const coeffs /* [3] */) {
  198|   664k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  199|   664k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|   664k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|   664k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  200|   664k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  201|       |
  202|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  203|       |  // This extra right shift will be taken care of at the end while rounding
  204|       |  // the result.
  205|       |  // Since all filter co-efficients are even, this change will not affect the
  206|       |  // end result
  207|   664k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  208|   664k|                            _mm_set1_epi16((short)0xffff)));
  209|   664k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  210|   664k|  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
  211|   664k|}
convolve_avx2.c:populate_coeffs_6tap_avx2:
   34|   664k|                                             __m256i coeffs[3]) {
   35|   664k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   36|       |
   37|       |  // coeffs 1 2 1 2 1 2 1 2
   38|   664k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
   39|       |  // coeffs 3 4 3 4 3 4 3 4
   40|   664k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
   41|       |  // coeffs 5 6 5 6 5 6 5 6
   42|   664k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
   43|   664k|}
convolve_avx2.c:y_convolve_6tap_8x2_avx2:
 1252|   690k|                                               __m256i ss_256[3]) {
 1253|   690k|  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
 1254|   690k|  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
  ------------------
  |  |   29|   690k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   690k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1255|   690k|  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
 1256|   690k|  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
  ------------------
  |  |   29|   690k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   690k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1257|   690k|  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
 1258|   690k|  return convolve_6tap_avx2(ss_256, coeffs);
 1259|   690k|}
convolve_avx2.c:convolve_6tap_avx2:
  477|  10.8M|                                         const __m256i coeffs[3]) {
  478|  10.8M|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  479|  10.8M|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  480|  10.8M|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  481|  10.8M|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  482|  10.8M|  return _mm256_add_epi16(res_0145, res_23);
  483|  10.8M|}
convolve_avx2.c:y_convolve_6tap_16x2_avx2:
 1265|   602k|                                             __m256i ss_256[6], __m256i r[2]) {
 1266|   602k|  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
 1267|   602k|  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
  ------------------
  |  |   29|   602k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   602k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1268|   602k|  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
 1269|   602k|  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
  ------------------
  |  |   29|   602k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|   602k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1270|   602k|  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
 1271|   602k|  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
 1272|   602k|  r[0] = convolve_6tap_avx2(ss_256, coeffs);
 1273|   602k|  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
 1274|   602k|}
convolve_avx2.c:y_convolve_6tap_32x2_avx2:
 1278|   713k|    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
 1279|   713k|  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
 1280|   713k|  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
 1281|   713k|  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
 1282|   713k|  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
 1283|   713k|  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
 1284|   713k|  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
 1285|   713k|  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
 1286|   713k|  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
 1287|   713k|  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
 1288|   713k|  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
 1289|   713k|}
convolve_avx2.c:prepare_half_coeffs_8tap_ssse3:
  130|  7.32k|    __m128i *const coeffs /* [4] */) {
  131|  7.32k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  132|  7.32k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  7.32k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  7.32k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  133|  7.32k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  134|       |
  135|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  136|       |  // This extra right shift will be taken care of at the end while rounding
  137|       |  // the result.
  138|       |  // Since all filter co-efficients are even, this change will not affect the
  139|       |  // end result
  140|  7.32k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  141|  7.32k|                            _mm_set1_epi16((short)0xffff)));
  142|       |
  143|  7.32k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  144|       |
  145|       |  // coeffs 0 1 0 1 0 1 0 1
  146|  7.32k|  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
  147|       |  // coeffs 2 3 2 3 2 3 2 3
  148|  7.32k|  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
  149|       |  // coeffs 4 5 4 5 4 5 4 5
  150|  7.32k|  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  151|       |  // coeffs 6 7 6 7 6 7 6 7
  152|  7.32k|  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
  153|  7.32k|}
convolve_avx2.c:y_convolve_8tap_2x2_ssse3:
 1295|  5.50k|                                                __m128i ss_128[4]) {
 1296|  5.50k|  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
 1297|  5.50k|  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
 1298|  5.50k|  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
 1299|  5.50k|  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
 1300|  5.50k|  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
 1301|  5.50k|  return convolve_8tap_ssse3(ss_128, coeffs);
 1302|  5.50k|}
convolve_avx2.c:convolve_8tap_ssse3:
  454|  38.2k|                                          const __m128i coeffs[4]) {
  455|  38.2k|  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  456|  38.2k|  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  457|  38.2k|  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  458|  38.2k|  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
  459|  38.2k|  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
  460|  38.2k|  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
  461|  38.2k|  return _mm_add_epi16(res_0145, res_2367);
  462|  38.2k|}
convolve_avx2.c:y_convolve_8tap_4x2_ssse3:
 1308|  32.7k|                                                __m128i ss_128[4]) {
 1309|  32.7k|  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
 1310|  32.7k|  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
 1311|  32.7k|  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
 1312|  32.7k|  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
 1313|  32.7k|  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
 1314|  32.7k|  return convolve_8tap_ssse3(ss_128, coeffs);
 1315|  32.7k|}
convolve_avx2.c:prepare_half_coeffs_8tap_avx2:
  215|  31.8k|    __m256i *const coeffs /* [4] */) {
  216|  31.8k|  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
  217|  31.8k|      filter_params, subpel_q4 & SUBPEL_MASK);
  ------------------
  |  |   24|  31.8k|#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
  |  |  ------------------
  |  |  |  |   23|  31.8k|#define SUBPEL_BITS 4
  |  |  ------------------
  ------------------
  218|  31.8k|  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  219|       |
  220|       |  // right shift all filter co-efficients by 1 to reduce the bits required.
  221|       |  // This extra right shift will be taken care of at the end while rounding
  222|       |  // the result.
  223|       |  // Since all filter co-efficients are even, this change will not affect the
  224|       |  // end result
  225|  31.8k|  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
  226|  31.8k|                            _mm_set1_epi16((short)0xffff)));
  227|  31.8k|  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  228|  31.8k|  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
  229|  31.8k|}
convolve_avx2.c:populate_coeffs_8tap_avx2:
   46|  31.8k|                                             __m256i coeffs[4]) {
   47|  31.8k|  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
   48|       |
   49|       |  // coeffs 0 1 0 1 0 1 0 1
   50|  31.8k|  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
   51|       |  // coeffs 2 3 2 3 2 3 2 3
   52|  31.8k|  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
   53|       |  // coeffs 4 5 4 5 4 5 4 5
   54|  31.8k|  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
   55|       |  // coeffs 6 7 6 7 6 7 6 7
   56|  31.8k|  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
   57|  31.8k|}
convolve_avx2.c:y_convolve_8tap_8x2_avx2:
 1321|  47.0k|                                               __m256i ss_256[4]) {
 1322|  47.0k|  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
 1323|  47.0k|  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
  ------------------
  |  |   29|  47.0k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  47.0k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1324|  47.0k|  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
 1325|  47.0k|  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
  ------------------
  |  |   29|  47.0k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  47.0k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1326|  47.0k|  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
 1327|  47.0k|  return convolve_8tap_avx2(ss_256, coeffs);
 1328|  47.0k|}
convolve_avx2.c:convolve_8tap_avx2:
  486|  1.01M|                                         const __m256i coeffs[4]) {
  487|  1.01M|  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  488|  1.01M|  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  489|  1.01M|  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  490|  1.01M|  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
  491|  1.01M|  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  492|  1.01M|  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
  493|  1.01M|  return _mm256_add_epi16(res_0145, res_2367);
  494|  1.01M|}
convolve_avx2.c:y_convolve_8tap_16x2_avx2:
 1334|  41.2k|                                             __m256i ss_256[8], __m256i r[2]) {
 1335|  41.2k|  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
 1336|  41.2k|  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
  ------------------
  |  |   29|  41.2k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  41.2k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1337|  41.2k|  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
 1338|  41.2k|  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
  ------------------
  |  |   29|  41.2k|  _mm256_set_m128i((hi), (lo))
  |  |  ------------------
  |  |  |  |   24|  41.2k|  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
  |  |  ------------------
  ------------------
 1339|  41.2k|  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
 1340|  41.2k|  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
 1341|  41.2k|  r[0] = convolve_8tap_avx2(ss_256, coeffs);
 1342|  41.2k|  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
 1343|  41.2k|}
convolve_avx2.c:y_convolve_8tap_32x2_avx2:
 1347|   107k|    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
 1348|   107k|  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
 1349|   107k|  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
 1350|   107k|  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
 1351|   107k|  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
 1352|   107k|  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
 1353|   107k|  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
 1354|   107k|  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
 1355|   107k|  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
 1356|   107k|  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
 1357|   107k|  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
 1358|   107k|}
convolve_avx2.c:av1_convolve_x_sr_specialized_avx2:
 2940|   828k|    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
 2941|   828k|  int32_t y = h;
 2942|   828k|  __m128i coeffs_128[4];
 2943|   828k|  __m256i coeffs_256[4];
 2944|       |
 2945|   828k|  assert(conv_params->round_0 == 3);
 2946|   828k|  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
 2947|   828k|         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
 2948|   828k|  (void)conv_params;
 2949|       |
 2950|   828k|  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
 2951|       |
 2952|   828k|  if (horz_tap == 2) {
  ------------------
  |  Branch (2952:7): [True: 44.7k, False: 783k]
  ------------------
 2953|       |    // horz_filt as 2 tap
 2954|  44.7k|    const uint8_t *src_ptr = src;
 2955|       |
 2956|  44.7k|    if (subpel_x_q4 != 8) {
  ------------------
  |  Branch (2956:9): [True: 28.3k, False: 16.4k]
  ------------------
 2957|  28.3k|      if (w <= 8) {
  ------------------
  |  Branch (2957:11): [True: 21.1k, False: 7.14k]
  ------------------
 2958|  21.1k|        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
 2959|  21.1k|                                       coeffs_128);
 2960|       |
 2961|  21.1k|        if (w == 2) {
  ------------------
  |  Branch (2961:13): [True: 3.18k, False: 17.9k]
  ------------------
 2962|  5.93k|          do {
 2963|  5.93k|            const __m128i res =
 2964|  5.93k|                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
 2965|  5.93k|            const __m128i r = sr_x_round_sse2(res);
 2966|  5.93k|            pack_store_2x2_sse2(r, dst, dst_stride);
 2967|  5.93k|            src_ptr += 2 * src_stride;
 2968|  5.93k|            dst += 2 * dst_stride;
 2969|  5.93k|            y -= 2;
 2970|  5.93k|          } while (y);
  ------------------
  |  Branch (2970:20): [True: 2.75k, False: 3.18k]
  ------------------
 2971|  17.9k|        } else if (w == 4) {
  ------------------
  |  Branch (2971:20): [True: 9.75k, False: 8.22k]
  ------------------
 2972|  30.9k|          do {
 2973|  30.9k|            const __m128i res =
 2974|  30.9k|                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
 2975|  30.9k|            const __m128i r = sr_x_round_sse2(res);
 2976|  30.9k|            pack_store_4x2_sse2(r, dst, dst_stride);
 2977|  30.9k|            src_ptr += 2 * src_stride;
 2978|  30.9k|            dst += 2 * dst_stride;
 2979|  30.9k|            y -= 2;
 2980|  30.9k|          } while (y);
  ------------------
  |  Branch (2980:20): [True: 21.2k, False: 9.75k]
  ------------------
 2981|  9.75k|        } else {
 2982|  8.22k|          assert(w == 8);
 2983|       |
 2984|  31.1k|          do {
 2985|  31.1k|            __m128i res[2];
 2986|       |
 2987|  31.1k|            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
 2988|  31.1k|            res[0] = sr_x_round_sse2(res[0]);
 2989|  31.1k|            res[1] = sr_x_round_sse2(res[1]);
 2990|  31.1k|            const __m128i d = _mm_packus_epi16(res[0], res[1]);
 2991|  31.1k|            _mm_storel_epi64((__m128i *)dst, d);
 2992|  31.1k|            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
 2993|       |
 2994|  31.1k|            src_ptr += 2 * src_stride;
 2995|  31.1k|            dst += 2 * dst_stride;
 2996|  31.1k|            y -= 2;
 2997|  31.1k|          } while (y);
  ------------------
  |  Branch (2997:20): [True: 22.9k, False: 8.22k]
  ------------------
 2998|  8.22k|        }
 2999|  21.1k|      } else {
 3000|  7.14k|        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
 3001|       |
 3002|  7.14k|        if (w == 16) {
  ------------------
  |  Branch (3002:13): [True: 4.35k, False: 2.79k]
  ------------------
 3003|  25.2k|          do {
 3004|  25.2k|            __m256i r[2];
 3005|       |
 3006|  25.2k|            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
 3007|  25.2k|            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
 3008|  25.2k|            src_ptr += 2 * src_stride;
 3009|  25.2k|            dst += 2 * dst_stride;
 3010|  25.2k|            y -= 2;
 3011|  25.2k|          } while (y);
  ------------------
  |  Branch (3011:20): [True: 20.8k, False: 4.35k]
  ------------------
 3012|  4.35k|        } else if (w == 32) {
  ------------------
  |  Branch (3012:20): [True: 1.57k, False: 1.21k]
  ------------------
 3013|  38.1k|          do {
 3014|  38.1k|            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
 3015|  38.1k|            src_ptr += src_stride;
 3016|  38.1k|            dst += dst_stride;
 3017|  38.1k|          } while (--y);
  ------------------
  |  Branch (3017:20): [True: 36.5k, False: 1.57k]
  ------------------
 3018|  1.57k|        } else if (w == 64) {
  ------------------
  |  Branch (3018:20): [True: 969, False: 248]
  ------------------
 3019|  46.9k|          do {
 3020|  46.9k|            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
 3021|  46.9k|            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
 3022|  46.9k|            src_ptr += src_stride;
 3023|  46.9k|            dst += dst_stride;
 3024|  46.9k|          } while (--y);
  ------------------
  |  Branch (3024:20): [True: 45.9k, False: 969]
  ------------------
 3025|    969|        } else {
 3026|    248|          assert(w == 128);
 3027|       |
 3028|  25.0k|          do {
 3029|  25.0k|            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
 3030|  25.0k|            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
 3031|  25.0k|            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
 3032|  25.0k|            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
 3033|  25.0k|            src_ptr += src_stride;
 3034|  25.0k|            dst += dst_stride;
 3035|  25.0k|          } while (--y);
  ------------------
  |  Branch (3035:20): [True: 24.7k, False: 248]
  ------------------
 3036|    248|        }
 3037|  7.14k|      }
 3038|  28.3k|    } else {
 3039|       |      // average to get half pel
 3040|  16.4k|      if (w == 2) {
  ------------------
  |  Branch (3040:11): [True: 1.89k, False: 14.5k]
  ------------------
 3041|  3.92k|        do {
 3042|  3.92k|          __m128i s_128;
 3043|       |
 3044|  3.92k|          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
 3045|  3.92k|          const __m128i s1 = _mm_srli_si128(s_128, 1);
 3046|  3.92k|          const __m128i d = _mm_avg_epu8(s_128, s1);
 3047|  3.92k|          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
 3048|  3.92k|          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
 3049|       |
 3050|  3.92k|          src_ptr += 2 * src_stride;
 3051|  3.92k|          dst += 2 * dst_stride;
 3052|  3.92k|          y -= 2;
 3053|  3.92k|        } while (y);
  ------------------
  |  Branch (3053:18): [True: 2.03k, False: 1.89k]
  ------------------
 3054|  14.5k|      } else if (w == 4) {
  ------------------
  |  Branch (3054:18): [True: 6.17k, False: 8.34k]
  ------------------
 3055|  17.5k|        do {
 3056|  17.5k|          __m128i s_128;
 3057|       |
 3058|  17.5k|          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
 3059|  17.5k|          const __m128i s1 = _mm_srli_si128(s_128, 1);
 3060|  17.5k|          const __m128i d = _mm_avg_epu8(s_128, s1);
 3061|  17.5k|          xx_storel_32(dst, d);
 3062|  17.5k|          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
 3063|       |
 3064|  17.5k|          src_ptr += 2 * src_stride;
 3065|  17.5k|          dst += 2 * dst_stride;
 3066|  17.5k|          y -= 2;
 3067|  17.5k|        } while (y);
  ------------------
  |  Branch (3067:18): [True: 11.3k, False: 6.17k]
  ------------------
 3068|  8.34k|      } else if (w == 8) {
  ------------------
  |  Branch (3068:18): [True: 4.55k, False: 3.79k]
  ------------------
 3069|  16.4k|        do {
 3070|  16.4k|          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
 3071|  16.4k|          const __m128i s10 =
 3072|  16.4k|              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
 3073|  16.4k|          const __m128i s01 = _mm_srli_si128(s00, 1);
 3074|  16.4k|          const __m128i s11 = _mm_srli_si128(s10, 1);
 3075|  16.4k|          const __m128i d0 = _mm_avg_epu8(s00, s01);
 3076|  16.4k|          const __m128i d1 = _mm_avg_epu8(s10, s11);
 3077|  16.4k|          _mm_storel_epi64((__m128i *)dst, d0);
 3078|  16.4k|          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
 3079|       |
 3080|  16.4k|          src_ptr += 2 * src_stride;
 3081|  16.4k|          dst += 2 * dst_stride;
 3082|  16.4k|          y -= 2;
 3083|  16.4k|        } while (y);
  ------------------
  |  Branch (3083:18): [True: 11.8k, False: 4.55k]
  ------------------
 3084|  4.55k|      } else if (w == 16) {
  ------------------
  |  Branch (3084:18): [True: 2.25k, False: 1.53k]
  ------------------
 3085|  13.3k|        do {
 3086|  13.3k|          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
 3087|  13.3k|          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
 3088|  13.3k|          const __m128i s10 =
 3089|  13.3k|              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
 3090|  13.3k|          const __m128i s11 =
 3091|  13.3k|              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
 3092|  13.3k|          const __m128i d0 = _mm_avg_epu8(s00, s01);
 3093|  13.3k|          const __m128i d1 = _mm_avg_epu8(s10, s11);
 3094|  13.3k|          _mm_storeu_si128((__m128i *)dst, d0);
 3095|  13.3k|          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
 3096|       |
 3097|  13.3k|          src_ptr += 2 * src_stride;
 3098|  13.3k|          dst += 2 * dst_stride;
 3099|  13.3k|          y -= 2;
 3100|  13.3k|        } while (y);
  ------------------
  |  Branch (3100:18): [True: 11.1k, False: 2.25k]
  ------------------
 3101|  2.25k|      } else if (w == 32) {
  ------------------
  |  Branch (3101:18): [True: 927, False: 608]
  ------------------
 3102|  28.7k|        do {
 3103|  28.7k|          sr_x_2tap_32_avg_avx2(src_ptr, dst);
 3104|  28.7k|          src_ptr += src_stride;
 3105|  28.7k|          dst += dst_stride;
 3106|  28.7k|        } while (--y);
  ------------------
  |  Branch (3106:18): [True: 27.7k, False: 927]
  ------------------
 3107|    927|      } else if (w == 64) {
  ------------------
  |  Branch (3107:18): [True: 468, False: 140]
  ------------------
 3108|  25.1k|        do {
 3109|  25.1k|          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
 3110|  25.1k|          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
 3111|  25.1k|          src_ptr += src_stride;
 3112|  25.1k|          dst += dst_stride;
 3113|  25.1k|        } while (--y);
  ------------------
  |  Branch (3113:18): [True: 24.7k, False: 468]
  ------------------
 3114|    468|      } else {
 3115|    140|        assert(w == 128);
 3116|       |
 3117|  14.0k|        do {
 3118|  14.0k|          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
 3119|  14.0k|          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
 3120|  14.0k|          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
 3121|  14.0k|          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
 3122|  14.0k|          src_ptr += src_stride;
 3123|  14.0k|          dst += dst_stride;
 3124|  14.0k|        } while (--y);
  ------------------
  |  Branch (3124:18): [True: 13.9k, False: 140]
  ------------------
 3125|    140|      }
 3126|  16.4k|    }
 3127|   783k|  } else if (horz_tap == 4) {
  ------------------
  |  Branch (3127:14): [True: 339k, False: 443k]
  ------------------
 3128|       |    // horz_filt as 4 tap
 3129|   339k|    const uint8_t *src_ptr = src - 1;
 3130|       |
 3131|   339k|    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
 3132|       |
 3133|   339k|    if (w == 2) {
  ------------------
  |  Branch (3133:9): [True: 64.4k, False: 275k]
  ------------------
 3134|   173k|      do {
 3135|   173k|        const __m128i res =
 3136|   173k|            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
 3137|   173k|        const __m128i r = sr_x_round_sse2(res);
 3138|   173k|        pack_store_2x2_sse2(r, dst, dst_stride);
 3139|   173k|        src_ptr += 2 * src_stride;
 3140|   173k|        dst += 2 * dst_stride;
 3141|   173k|        y -= 2;
 3142|   173k|      } while (y);
  ------------------
  |  Branch (3142:16): [True: 109k, False: 64.4k]
  ------------------
 3143|   275k|    } else if (w == 4) {
  ------------------
  |  Branch (3143:16): [True: 250k, False: 25.3k]
  ------------------
 3144|   891k|      do {
 3145|   891k|        const __m128i res =
 3146|   891k|            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
 3147|   891k|        const __m128i r = sr_x_round_sse2(res);
 3148|   891k|        pack_store_4x2_sse2(r, dst, dst_stride);
 3149|   891k|        src_ptr += 2 * src_stride;
 3150|   891k|        dst += 2 * dst_stride;
 3151|   891k|        y -= 2;
 3152|   891k|      } while (y);
  ------------------
  |  Branch (3152:16): [True: 641k, False: 250k]
  ------------------
 3153|   250k|    } else if (w == 8) {
  ------------------
  |  Branch (3153:16): [True: 15.0k, False: 10.2k]
  ------------------
 3154|       |      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
 3155|       |      // rewrite this for better performance later.
 3156|  15.0k|      __m256i filt_256[2];
 3157|  15.0k|      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
 3158|       |
 3159|  15.0k|      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
 3160|  15.0k|      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
 3161|  71.5k|      for (int i = 0; i < h; i += 2) {
  ------------------
  |  Branch (3161:23): [True: 56.5k, False: 15.0k]
  ------------------
 3162|  56.5k|        const __m256i data = _mm256_permute2x128_si256(
 3163|  56.5k|            _mm256_castsi128_si256(
 3164|  56.5k|                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
 3165|  56.5k|            _mm256_castsi128_si256(_mm_loadu_si128(
 3166|  56.5k|                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
 3167|  56.5k|            0x20);
 3168|       |
 3169|  56.5k|        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
 3170|  56.5k|        res_16b = sr_x_round_avx2(res_16b);
 3171|       |
 3172|  56.5k|        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
 3173|       |
 3174|  56.5k|        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
 3175|  56.5k|        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
 3176|       |
 3177|  56.5k|        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
 3178|  56.5k|        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
 3179|  56.5k|      }
 3180|  15.0k|    } else {
 3181|  10.2k|      assert(!(w % 16));
 3182|       |      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
 3183|       |      // rewrite this for better performance later.
 3184|  10.2k|      __m256i filt_256[2];
 3185|  10.2k|      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
 3186|  10.2k|      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
 3187|  10.2k|      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
 3188|       |
 3189|   215k|      for (int i = 0; i < h; ++i) {
  ------------------
  |  Branch (3189:23): [True: 204k, False: 10.2k]
  ------------------
 3190|   764k|        for (int j = 0; j < w; j += 16) {
  ------------------
  |  Branch (3190:25): [True: 559k, False: 204k]
  ------------------
 3191|       |          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
 3192|       |          // 18 19 20 21 22 23
 3193|   559k|          const __m256i data = _mm256_inserti128_si256(
 3194|   559k|              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
 3195|   559k|              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
 3196|   559k|              1);
 3197|       |
 3198|   559k|          __m256i res_16b =
 3199|   559k|              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
 3200|   559k|          res_16b = sr_x_round_avx2(res_16b);
 3201|       |
 3202|       |          /* rounding code */
 3203|       |          // 8 bit conversion and saturation to uint8
 3204|   559k|          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
 3205|       |
 3206|       |          // Store values into the destination buffer
 3207|       |          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
 3208|   559k|          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
 3209|   559k|          __m128i res = _mm256_castsi256_si128(res_8b);
 3210|   559k|          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
 3211|   559k|        }
 3212|   204k|      }
 3213|  10.2k|    }
 3214|   443k|  } else {
 3215|   443k|    __m256i filt_256[4];
 3216|       |
 3217|   443k|    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
 3218|   443k|    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
 3219|   443k|    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
 3220|       |
 3221|   443k|    if (horz_tap == 6) {
  ------------------
  |  Branch (3221:9): [True: 427k, False: 15.9k]
  ------------------
 3222|       |      // horz_filt as 6 tap
 3223|   427k|      const uint8_t *src_ptr = src - 2;
 3224|       |
 3225|   427k|      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
 3226|       |
 3227|   427k|      if (w == 8) {
  ------------------
  |  Branch (3227:11): [True: 237k, False: 189k]
  ------------------
 3228|   961k|        do {
 3229|   961k|          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
 3230|   961k|                                                       coeffs_256, filt_256);
 3231|   961k|          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
 3232|   961k|          src_ptr += 2 * src_stride;
 3233|   961k|          dst += 2 * dst_stride;
 3234|   961k|          y -= 2;
 3235|   961k|        } while (y);
  ------------------
  |  Branch (3235:18): [True: 723k, False: 237k]
  ------------------
 3236|   237k|      } else if (w == 16) {
  ------------------
  |  Branch (3236:18): [True: 146k, False: 43.2k]
  ------------------
 3237|   788k|        do {
 3238|   788k|          __m256i r[2];
 3239|       |
 3240|   788k|          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
 3241|   788k|                                    r);
 3242|   788k|          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
 3243|   788k|          src_ptr += 2 * src_stride;
 3244|   788k|          dst += 2 * dst_stride;
 3245|   788k|          y -= 2;
 3246|   788k|        } while (y);
  ------------------
  |  Branch (3246:18): [True: 641k, False: 146k]
  ------------------
 3247|   146k|      } else if (w == 32) {
  ------------------
  |  Branch (3247:18): [True: 36.5k, False: 6.67k]
  ------------------
 3248|   696k|        do {
 3249|   696k|          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3250|   696k|          src_ptr += src_stride;
 3251|   696k|          dst += dst_stride;
 3252|   696k|        } while (--y);
  ------------------
  |  Branch (3252:18): [True: 659k, False: 36.5k]
  ------------------
 3253|  36.5k|      } else if (w == 64) {
  ------------------
  |  Branch (3253:18): [True: 5.58k, False: 1.08k]
  ------------------
 3254|   279k|        do {
 3255|   279k|          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3256|   279k|          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
 3257|   279k|          src_ptr += src_stride;
 3258|   279k|          dst += dst_stride;
 3259|   279k|        } while (--y);
  ------------------
  |  Branch (3259:18): [True: 274k, False: 5.58k]
  ------------------
 3260|  5.58k|      } else {
 3261|  1.08k|        assert(w == 128);
 3262|       |
 3263|   130k|        do {
 3264|   130k|          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3265|   130k|          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
 3266|   130k|                            dst + 1 * 32);
 3267|   130k|          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
 3268|   130k|                            dst + 2 * 32);
 3269|   130k|          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
 3270|   130k|                            dst + 3 * 32);
 3271|   130k|          src_ptr += src_stride;
 3272|   130k|          dst += dst_stride;
 3273|   130k|        } while (--y);
  ------------------
  |  Branch (3273:18): [True: 129k, False: 1.10k]
  ------------------
 3274|  1.10k|      }
 3275|   427k|    } else if (horz_tap == 8) {
  ------------------
  |  Branch (3275:16): [True: 15.9k, False: 18.4E]
  ------------------
 3276|       |      // horz_filt as 8 tap
 3277|  15.9k|      const uint8_t *src_ptr = src - 3;
 3278|       |
 3279|  15.9k|      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
 3280|       |
 3281|  15.9k|      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
 3282|       |
 3283|  15.9k|      if (w == 8) {
  ------------------
  |  Branch (3283:11): [True: 8.42k, False: 7.56k]
  ------------------
 3284|  36.2k|        do {
 3285|  36.2k|          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
 3286|  36.2k|                                                       coeffs_256, filt_256);
 3287|  36.2k|          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
 3288|  36.2k|          src_ptr += 2 * src_stride;
 3289|  36.2k|          dst += 2 * dst_stride;
 3290|  36.2k|          y -= 2;
 3291|  36.2k|        } while (y);
  ------------------
  |  Branch (3291:18): [True: 27.8k, False: 8.42k]
  ------------------
 3292|  8.42k|      } else if (w == 16) {
  ------------------
  |  Branch (3292:18): [True: 4.95k, False: 2.61k]
  ------------------
 3293|  29.3k|        do {
 3294|  29.3k|          __m256i r[2];
 3295|       |
 3296|  29.3k|          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
 3297|  29.3k|                                    r);
 3298|  29.3k|          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
 3299|  29.3k|          src_ptr += 2 * src_stride;
 3300|  29.3k|          dst += 2 * dst_stride;
 3301|  29.3k|          y -= 2;
 3302|  29.3k|        } while (y);
  ------------------
  |  Branch (3302:18): [True: 24.3k, False: 4.95k]
  ------------------
 3303|  4.95k|      } else if (w == 32) {
  ------------------
  |  Branch (3303:18): [True: 1.77k, False: 836]
  ------------------
 3304|  39.6k|        do {
 3305|  39.6k|          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3306|  39.6k|          src_ptr += src_stride;
 3307|  39.6k|          dst += dst_stride;
 3308|  39.6k|        } while (--y);
  ------------------
  |  Branch (3308:18): [True: 37.8k, False: 1.77k]
  ------------------
 3309|  1.77k|      } else if (w == 64) {
  ------------------
  |  Branch (3309:18): [True: 656, False: 180]
  ------------------
 3310|  31.9k|        do {
 3311|  31.9k|          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3312|  31.9k|          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
 3313|  31.9k|          src_ptr += src_stride;
 3314|  31.9k|          dst += dst_stride;
 3315|  31.9k|        } while (--y);
  ------------------
  |  Branch (3315:18): [True: 31.3k, False: 656]
  ------------------
 3316|    656|      } else {
 3317|    180|        assert(w == 128);
 3318|       |
 3319|  19.0k|        do {
 3320|  19.0k|          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
 3321|  19.0k|          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
 3322|  19.0k|                            dst + 1 * 32);
 3323|  19.0k|          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
 3324|  19.0k|                            dst + 2 * 32);
 3325|  19.0k|          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
 3326|  19.0k|                            dst + 3 * 32);
 3327|  19.0k|          src_ptr += src_stride;
 3328|  19.0k|          dst += dst_stride;
 3329|  19.0k|        } while (--y);
  ------------------
  |  Branch (3329:18): [True: 18.8k, False: 180]
  ------------------
 3330|    180|      }
 3331|  15.9k|    }
 3332|   443k|  }
 3333|   828k|}
convolve_avx2.c:x_convolve_2tap_2x2_sse4_1:
  859|  5.93k|                                                 const __m128i coeffs[1]) {
  860|  5.93k|  const __m128i sfl =
  861|  5.93k|      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  862|  5.93k|  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
  863|  5.93k|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  864|  5.93k|  return convolve_2tap_ssse3(&ss, coeffs);
  865|  5.93k|}
convolve_avx2.c:sr_x_round_sse2:
  780|  1.16M|static inline __m128i sr_x_round_sse2(const __m128i src) {
  781|  1.16M|  const __m128i round = _mm_set1_epi16(34);
  782|  1.16M|  const __m128i dst = _mm_add_epi16(src, round);
  783|  1.16M|  return _mm_srai_epi16(dst, 6);
  784|  1.16M|}
convolve_avx2.c:x_convolve_2tap_4x2_ssse3:
  869|  30.9k|                                                const __m128i coeffs[1]) {
  870|  30.9k|  const __m128i sfl =
  871|  30.9k|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  872|  30.9k|  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
  873|  30.9k|  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  874|  30.9k|  return convolve_2tap_ssse3(&ss, coeffs);
  875|  30.9k|}
convolve_avx2.c:x_convolve_2tap_8x2_ssse3:
  880|  31.1k|                                             __m128i r[2]) {
  881|  31.1k|  __m128i ss[2];
  882|  31.1k|  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
  883|  31.1k|  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
  884|  31.1k|  const __m128i s01 = _mm_srli_si128(s00, 1);
  885|  31.1k|  const __m128i s11 = _mm_srli_si128(s10, 1);
  886|  31.1k|  ss[0] = _mm_unpacklo_epi8(s00, s01);
  887|  31.1k|  ss[1] = _mm_unpacklo_epi8(s10, s11);
  888|       |
  889|  31.1k|  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
  890|  31.1k|  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
  891|  31.1k|}
convolve_avx2.c:x_convolve_2tap_16x2_avx2:
  912|  25.2k|                                             __m256i r[2]) {
  913|  25.2k|  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
  914|  25.2k|  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
  915|  25.2k|  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
  916|  25.2k|  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
  917|  25.2k|  r[0] = convolve_2tap_avx2(&s0, coeffs);
  918|  25.2k|  r[1] = convolve_2tap_avx2(&s1, coeffs);
  919|  25.2k|}
convolve_avx2.c:sr_x_round_store_16x2_avx2:
  807|   842k|                                              const ptrdiff_t dst_stride) {
  808|   842k|  __m256i r[2];
  809|       |
  810|   842k|  r[0] = sr_x_round_avx2(res[0]);
  811|   842k|  r[1] = sr_x_round_avx2(res[1]);
  812|   842k|  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
  813|   842k|}
convolve_avx2.c:sr_x_2tap_32_avx2:
 2910|   232k|                                     uint8_t *const dst) {
 2911|   232k|  __m256i r[2];
 2912|       |
 2913|   232k|  x_convolve_2tap_32_avx2(src, coeffs, r);
 2914|   232k|  sr_x_round_store_32_avx2(r, dst);
 2915|   232k|}
convolve_avx2.c:x_convolve_2tap_32_avx2:
  923|   232k|                                           __m256i r[2]) {
  924|   232k|  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  925|   232k|  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  926|   232k|  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
  927|   232k|  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
  928|       |
  929|   232k|  r[0] = convolve_2tap_avx2(&ss0, coeffs);
  930|   232k|  r[1] = convolve_2tap_avx2(&ss1, coeffs);
  931|   232k|}
convolve_avx2.c:sr_x_round_store_32_avx2:
  816|  2.18M|                                            uint8_t *const dst) {
  817|  2.18M|  __m256i r[2];
  818|       |
  819|  2.18M|  r[0] = sr_x_round_avx2(res[0]);
  820|  2.18M|  r[1] = sr_x_round_avx2(res[1]);
  821|  2.18M|  convolve_store_32_avx2(r[0], r[1], dst);
  822|  2.18M|}
convolve_avx2.c:sr_x_2tap_32_avg_avx2:
  850|   135k|                                         uint8_t *const dst) {
  851|   135k|  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  852|   135k|  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  853|   135k|  const __m256i d = _mm256_avg_epu8(s0, s1);
  854|   135k|  _mm256_storeu_si256((__m256i *)dst, d);
  855|   135k|}
convolve_avx2.c:x_convolve_4tap_2x2_ssse3:
  935|   173k|                                                const __m128i coeffs[2]) {
  936|   173k|  const __m128i sfl0 =
  937|   173k|      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  938|   173k|  const __m128i sfl1 =
  939|   173k|      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  940|   173k|  const __m128i s = load_u8_8x2_sse2(src, stride);
  941|   173k|  __m128i ss[2];
  942|       |
  943|   173k|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  944|   173k|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  945|   173k|  return convolve_4tap_ssse3(ss, coeffs);
  946|   173k|}
convolve_avx2.c:x_convolve_4tap_4x2_ssse3:
  950|   891k|                                                const __m128i coeffs[2]) {
  951|   891k|  const __m128i s = load_u8_8x2_sse2(src, stride);
  952|   891k|  const __m128i sfl0 =
  953|   891k|      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  954|   891k|  const __m128i sfl1 =
  955|   891k|      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
  956|   891k|  __m128i ss[2];
  957|       |
  958|   891k|  ss[0] = _mm_shuffle_epi8(s, sfl0);
  959|   891k|  ss[1] = _mm_shuffle_epi8(s, sfl1);
  960|   891k|  return convolve_4tap_ssse3(ss, coeffs);
  961|   891k|}
convolve_avx2.c:sr_x_round_avx2:
  786|  7.66M|static inline __m256i sr_x_round_avx2(const __m256i src) {
  787|  7.66M|  const __m256i round = _mm256_set1_epi16(34);
  788|  7.66M|  const __m256i dst = _mm256_add_epi16(src, round);
  789|  7.66M|  return _mm256_srai_epi16(dst, 6);
  790|  7.66M|}
convolve_avx2.c:x_convolve_6tap_8x2_avx2:
 1031|  2.53M|                                               const __m256i filt[3]) {
 1032|  2.53M|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1033|  2.53M|  return x_convolve_6tap_avx2(s_256, coeffs, filt);
 1034|  2.53M|}
convolve_avx2.c:x_convolve_6tap_avx2:
  573|  6.08M|                                           const __m256i filt[3]) {
  574|  6.08M|  __m256i ss[3];
  575|       |
  576|  6.08M|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  577|  6.08M|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  578|  6.08M|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  579|       |
  580|  6.08M|  return convolve_6tap_avx2(ss, coeffs);
  581|  6.08M|}
convolve_avx2.c:sr_x_round_store_8x2_avx2:
  800|   997k|                                             const ptrdiff_t dst_stride) {
  801|   997k|  const __m256i r = sr_x_round_avx2(res);
  802|   997k|  pack_store_8x2_avx2(r, dst, dst_stride);
  803|   997k|}
convolve_avx2.c:x_convolve_6tap_16x2_avx2:
 1040|   788k|                                             __m256i r[2]) {
 1041|   788k|  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1042|   788k|  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1043|   788k|}
convolve_avx2.c:sr_x_6tap_32_avx2:
 2920|  1.77M|                                     uint8_t *const dst) {
 2921|  1.77M|  __m256i r[2];
 2922|       |
 2923|  1.77M|  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
 2924|  1.77M|  sr_x_round_store_32_avx2(r, dst);
 2925|  1.77M|}
convolve_avx2.c:x_convolve_6tap_32_avx2:
 1048|  1.77M|                                           __m256i r[2]) {
 1049|  1.77M|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1050|  1.77M|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1051|       |
 1052|  1.77M|  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
 1053|  1.77M|  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
 1054|  1.77M|}
convolve_avx2.c:x_convolve_8tap_8x2_avx2:
 1059|  94.8k|                                               const __m256i filt[4]) {
 1060|  94.8k|  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
 1061|  94.8k|  return x_convolve_8tap_avx2(s_256, coeffs, filt);
 1062|  94.8k|}
convolve_avx2.c:x_convolve_8tap_avx2:
  585|   454k|                                           const __m256i filt[4]) {
  586|   454k|  __m256i ss[4];
  587|       |
  588|   454k|  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  589|   454k|  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  590|   454k|  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  591|   454k|  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
  592|       |
  593|   454k|  return convolve_8tap_avx2(ss, coeffs);
  594|   454k|}
convolve_avx2.c:x_convolve_8tap_16x2_avx2:
 1068|  29.3k|                                                       __m256i r[2]) {
 1069|  29.3k|  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
 1070|  29.3k|  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
 1071|  29.3k|}
convolve_avx2.c:sr_x_8tap_32_avx2:
 2930|   179k|                                               uint8_t *const dst) {
 2931|   179k|  __m256i r[2];
 2932|       |
 2933|   179k|  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
 2934|   179k|  sr_x_round_store_32_avx2(r, dst);
 2935|   179k|}
convolve_avx2.c:x_convolve_8tap_32_avx2:
 1076|   179k|                                                     __m256i r[2]) {
 1077|   179k|  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
 1078|   179k|  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
 1079|       |
 1080|   179k|  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
 1081|   179k|  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
 1082|   179k|}

convolve_2d_avx2.c:load_u8_8x2_sse2:
   19|  4.28M|                                       const ptrdiff_t stride) {
   20|  4.28M|  return load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
   21|  4.28M|}
convolve_2d_avx2.c:store_u8_4x2_sse2:
   25|  79.8k|                                               const ptrdiff_t stride) {
   26|  79.8k|  xx_storel_32(dst, src);
   27|  79.8k|  *(uint32_t *)(dst + stride) =
   28|  79.8k|      ((uint32_t)_mm_extract_epi16(src, 3) << 16) | _mm_extract_epi16(src, 2);
   29|  79.8k|}
convolve_avx2.c:store_u8_4x2_sse2:
   25|  1.75M|                                               const ptrdiff_t stride) {
   26|  1.75M|  xx_storel_32(dst, src);
   27|  1.75M|  *(uint32_t *)(dst + stride) =
   28|  1.75M|      ((uint32_t)_mm_extract_epi16(src, 3) << 16) | _mm_extract_epi16(src, 2);
   29|  1.75M|}
convolve_avx2.c:load_u8_8x2_sse2:
   19|  1.11M|                                       const ptrdiff_t stride) {
   20|  1.11M|  return load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
   21|  1.11M|}

aom_dsp_rtcd.c:setup_rtcd_internal:
 1711|      1|{
 1712|      1|    int flags = x86_simd_caps();
 1713|       |
 1714|      1|    (void)flags;
 1715|       |
 1716|      1|    aom_blend_a64_hmask = aom_blend_a64_hmask_c;
 1717|      1|    if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1717:9): [True: 1, False: 0]
  ------------------
 1718|      1|    aom_blend_a64_mask = aom_blend_a64_mask_c;
 1719|      1|    if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1719:9): [True: 1, False: 0]
  ------------------
 1720|      1|    if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1720:9): [True: 1, False: 0]
  ------------------
 1721|      1|    aom_blend_a64_vmask = aom_blend_a64_vmask_c;
 1722|      1|    if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1722:9): [True: 1, False: 0]
  ------------------
 1723|      1|    aom_convolve8_horiz = aom_convolve8_horiz_c;
 1724|      1|    if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1724:9): [True: 1, False: 0]
  ------------------
 1725|      1|    if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1725:9): [True: 1, False: 0]
  ------------------
 1726|      1|    aom_convolve8_vert = aom_convolve8_vert_c;
 1727|      1|    if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1727:9): [True: 1, False: 0]
  ------------------
 1728|      1|    if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1728:9): [True: 1, False: 0]
  ------------------
 1729|      1|    aom_convolve_copy = aom_convolve_copy_sse2;
 1730|      1|    if (flags & HAS_AVX2) aom_convolve_copy = aom_convolve_copy_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1730:9): [True: 1, False: 0]
  ------------------
 1731|      1|    aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
 1732|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1732:9): [True: 1, False: 0]
  ------------------
 1733|      1|    aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
 1734|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1734:9): [True: 1, False: 0]
  ------------------
 1735|      1|    aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
 1736|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1736:9): [True: 1, False: 0]
  ------------------
 1737|      1|    aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
 1738|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1738:9): [True: 1, False: 0]
  ------------------
 1739|      1|    aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
 1740|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1740:9): [True: 1, False: 0]
  ------------------
 1741|      1|    aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
 1742|      1|    if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1742:9): [True: 1, False: 0]
  ------------------
 1743|      1|    aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
 1744|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1744:9): [True: 1, False: 0]
  ------------------
 1745|      1|    aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
 1746|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1746:9): [True: 1, False: 0]
  ------------------
 1747|      1|    aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
 1748|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1748:9): [True: 1, False: 0]
  ------------------
 1749|      1|    aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
 1750|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1750:9): [True: 1, False: 0]
  ------------------
 1751|      1|    aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
 1752|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1752:9): [True: 1, False: 0]
  ------------------
 1753|      1|    aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
 1754|      1|    if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1754:9): [True: 1, False: 0]
  ------------------
 1755|      1|    aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
 1756|      1|    if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1756:9): [True: 1, False: 0]
  ------------------
 1757|      1|    aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
 1758|      1|    if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1758:9): [True: 1, False: 0]
  ------------------
 1759|      1|    aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
 1760|      1|    if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1760:9): [True: 1, False: 0]
  ------------------
 1761|      1|    aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
 1762|      1|    if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1762:9): [True: 1, False: 0]
  ------------------
 1763|      1|    aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
 1764|      1|    if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1764:9): [True: 1, False: 0]
  ------------------
 1765|      1|    aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
 1766|      1|    if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1766:9): [True: 1, False: 0]
  ------------------
 1767|      1|    aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
 1768|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1768:9): [True: 1, False: 0]
  ------------------
 1769|      1|    aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
 1770|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1770:9): [True: 1, False: 0]
  ------------------
 1771|      1|    aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
 1772|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1772:9): [True: 1, False: 0]
  ------------------
 1773|      1|    aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
 1774|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1774:9): [True: 1, False: 0]
  ------------------
 1775|      1|    aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
 1776|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1776:9): [True: 1, False: 0]
  ------------------
 1777|      1|    aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
 1778|      1|    if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1778:9): [True: 1, False: 0]
  ------------------
 1779|      1|    aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
 1780|      1|    if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1780:9): [True: 1, False: 0]
  ------------------
 1781|      1|    aom_highbd_blend_a64_d16_mask = aom_highbd_blend_a64_d16_mask_c;
 1782|      1|    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_d16_mask = aom_highbd_blend_a64_d16_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1782:9): [True: 1, False: 0]
  ------------------
 1783|      1|    if (flags & HAS_AVX2) aom_highbd_blend_a64_d16_mask = aom_highbd_blend_a64_d16_mask_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1783:9): [True: 1, False: 0]
  ------------------
 1784|      1|    aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
 1785|      1|    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1785:9): [True: 1, False: 0]
  ------------------
 1786|      1|    aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
 1787|      1|    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1787:9): [True: 1, False: 0]
  ------------------
 1788|      1|    aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
 1789|      1|    if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1789:9): [True: 1, False: 0]
  ------------------
 1790|      1|    aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_sse2;
 1791|      1|    if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1791:9): [True: 1, False: 0]
  ------------------
 1792|      1|    aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_sse2;
 1793|      1|    if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1793:9): [True: 1, False: 0]
  ------------------
 1794|      1|    aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
 1795|      1|    if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1795:9): [True: 1, False: 0]
  ------------------
 1796|      1|    aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
 1797|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1797:9): [True: 1, False: 0]
  ------------------
 1798|      1|    aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
 1799|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1799:9): [True: 1, False: 0]
  ------------------
 1800|      1|    aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
 1801|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1801:9): [True: 1, False: 0]
  ------------------
 1802|      1|    aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
 1803|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1803:9): [True: 1, False: 0]
  ------------------
 1804|      1|    aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
 1805|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1805:9): [True: 1, False: 0]
  ------------------
 1806|      1|    aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
 1807|      1|    if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1807:9): [True: 1, False: 0]
  ------------------
 1808|      1|    aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
 1809|      1|    if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (1809:9): [True: 1, False: 0]
  ------------------
 1810|      1|    if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1810:9): [True: 1, False: 0]
  ------------------
 1811|      1|    aom_lpf_horizontal_14_quad = aom_lpf_horizontal_14_quad_sse2;
 1812|      1|    if (flags & HAS_AVX2) aom_lpf_horizontal_14_quad = aom_lpf_horizontal_14_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1812:9): [True: 1, False: 0]
  ------------------
 1813|      1|    aom_lpf_horizontal_6_quad = aom_lpf_horizontal_6_quad_sse2;
 1814|      1|    if (flags & HAS_AVX2) aom_lpf_horizontal_6_quad = aom_lpf_horizontal_6_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1814:9): [True: 1, False: 0]
  ------------------
 1815|      1|    aom_lpf_horizontal_8_quad = aom_lpf_horizontal_8_quad_sse2;
 1816|      1|    if (flags & HAS_AVX2) aom_lpf_horizontal_8_quad = aom_lpf_horizontal_8_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1816:9): [True: 1, False: 0]
  ------------------
 1817|      1|    aom_lpf_vertical_14_quad = aom_lpf_vertical_14_quad_sse2;
 1818|      1|    if (flags & HAS_AVX2) aom_lpf_vertical_14_quad = aom_lpf_vertical_14_quad_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1818:9): [True: 1, False: 0]
  ------------------
 1819|      1|    aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
 1820|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1820:9): [True: 1, False: 0]
  ------------------
 1821|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1821:9): [True: 1, False: 0]
  ------------------
 1822|      1|    aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
 1823|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1823:9): [True: 1, False: 0]
  ------------------
 1824|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1824:9): [True: 1, False: 0]
  ------------------
 1825|      1|    aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
 1826|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1826:9): [True: 1, False: 0]
  ------------------
 1827|      1|    aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
 1828|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1828:9): [True: 1, False: 0]
  ------------------
 1829|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1829:9): [True: 1, False: 0]
  ------------------
 1830|      1|    aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
 1831|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1831:9): [True: 1, False: 0]
  ------------------
 1832|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1832:9): [True: 1, False: 0]
  ------------------
 1833|      1|    aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
 1834|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1834:9): [True: 1, False: 0]
  ------------------
 1835|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1835:9): [True: 1, False: 0]
  ------------------
 1836|      1|    aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
 1837|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1837:9): [True: 1, False: 0]
  ------------------
 1838|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1838:9): [True: 1, False: 0]
  ------------------
 1839|      1|    aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
 1840|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1840:9): [True: 1, False: 0]
  ------------------
 1841|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1841:9): [True: 1, False: 0]
  ------------------
 1842|      1|    aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
 1843|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1843:9): [True: 1, False: 0]
  ------------------
 1844|      1|    aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
 1845|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1845:9): [True: 1, False: 0]
  ------------------
 1846|      1|    aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
 1847|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1847:9): [True: 1, False: 0]
  ------------------
 1848|      1|    aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
 1849|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1849:9): [True: 1, False: 0]
  ------------------
 1850|      1|    aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
 1851|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1851:9): [True: 1, False: 0]
  ------------------
 1852|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1852:9): [True: 1, False: 0]
  ------------------
 1853|      1|    aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
 1854|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1854:9): [True: 1, False: 0]
  ------------------
 1855|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1855:9): [True: 1, False: 0]
  ------------------
 1856|      1|    aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
 1857|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1857:9): [True: 1, False: 0]
  ------------------
 1858|      1|    if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1858:9): [True: 1, False: 0]
  ------------------
 1859|      1|    aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
 1860|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1860:9): [True: 1, False: 0]
  ------------------
 1861|      1|    aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
 1862|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1862:9): [True: 1, False: 0]
  ------------------
 1863|      1|    aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
 1864|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1864:9): [True: 1, False: 0]
  ------------------
 1865|      1|    aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
 1866|      1|    if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1866:9): [True: 1, False: 0]
  ------------------
 1867|      1|    aom_scaled_2d = aom_scaled_2d_c;
 1868|      1|    if (flags & HAS_SSSE3) aom_scaled_2d = aom_scaled_2d_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1868:9): [True: 1, False: 0]
  ------------------
 1869|      1|    aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
 1870|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1870:9): [True: 1, False: 0]
  ------------------
 1871|      1|    aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
 1872|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1872:9): [True: 1, False: 0]
  ------------------
 1873|      1|    aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
 1874|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1874:9): [True: 1, False: 0]
  ------------------
 1875|      1|    aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
 1876|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1876:9): [True: 1, False: 0]
  ------------------
 1877|      1|    aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
 1878|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1878:9): [True: 1, False: 0]
  ------------------
 1879|      1|    aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
 1880|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1880:9): [True: 1, False: 0]
  ------------------
 1881|      1|    aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
 1882|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1882:9): [True: 1, False: 0]
  ------------------
 1883|      1|    aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
 1884|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1884:9): [True: 1, False: 0]
  ------------------
 1885|      1|    aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
 1886|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1886:9): [True: 1, False: 0]
  ------------------
 1887|      1|    aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
 1888|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1888:9): [True: 1, False: 0]
  ------------------
 1889|      1|    aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
 1890|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1890:9): [True: 1, False: 0]
  ------------------
 1891|      1|    aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
 1892|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1892:9): [True: 1, False: 0]
  ------------------
 1893|      1|    aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
 1894|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1894:9): [True: 1, False: 0]
  ------------------
 1895|      1|    aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
 1896|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1896:9): [True: 1, False: 0]
  ------------------
 1897|      1|    aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
 1898|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1898:9): [True: 1, False: 0]
  ------------------
 1899|      1|    aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
 1900|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1900:9): [True: 1, False: 0]
  ------------------
 1901|      1|    aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
 1902|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1902:9): [True: 1, False: 0]
  ------------------
 1903|      1|    aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
 1904|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1904:9): [True: 1, False: 0]
  ------------------
 1905|      1|    aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
 1906|      1|    if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1906:9): [True: 1, False: 0]
  ------------------
 1907|      1|    aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
 1908|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1908:9): [True: 1, False: 0]
  ------------------
 1909|      1|    aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
 1910|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1910:9): [True: 1, False: 0]
  ------------------
 1911|      1|    aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
 1912|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1912:9): [True: 1, False: 0]
  ------------------
 1913|      1|    aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
 1914|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1914:9): [True: 1, False: 0]
  ------------------
 1915|      1|    aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
 1916|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1916:9): [True: 1, False: 0]
  ------------------
 1917|      1|    aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
 1918|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1918:9): [True: 1, False: 0]
  ------------------
 1919|      1|    aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
 1920|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1920:9): [True: 1, False: 0]
  ------------------
 1921|      1|    aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
 1922|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1922:9): [True: 1, False: 0]
  ------------------
 1923|      1|    aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
 1924|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1924:9): [True: 1, False: 0]
  ------------------
 1925|      1|    aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
 1926|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1926:9): [True: 1, False: 0]
  ------------------
 1927|      1|    aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
 1928|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1928:9): [True: 1, False: 0]
  ------------------
 1929|      1|    aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
 1930|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1930:9): [True: 1, False: 0]
  ------------------
 1931|      1|    aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
 1932|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1932:9): [True: 1, False: 0]
  ------------------
 1933|      1|    aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
 1934|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1934:9): [True: 1, False: 0]
  ------------------
 1935|      1|    aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
 1936|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1936:9): [True: 1, False: 0]
  ------------------
 1937|      1|    aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
 1938|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1938:9): [True: 1, False: 0]
  ------------------
 1939|      1|    aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
 1940|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1940:9): [True: 1, False: 0]
  ------------------
 1941|      1|    aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
 1942|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1942:9): [True: 1, False: 0]
  ------------------
 1943|      1|    aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
 1944|      1|    if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1944:9): [True: 1, False: 0]
  ------------------
 1945|      1|    aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
 1946|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1946:9): [True: 1, False: 0]
  ------------------
 1947|      1|    aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
 1948|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1948:9): [True: 1, False: 0]
  ------------------
 1949|      1|    aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
 1950|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1950:9): [True: 1, False: 0]
  ------------------
 1951|      1|    aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
 1952|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1952:9): [True: 1, False: 0]
  ------------------
 1953|      1|    aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
 1954|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1954:9): [True: 1, False: 0]
  ------------------
 1955|      1|    aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
 1956|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1956:9): [True: 1, False: 0]
  ------------------
 1957|      1|    aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
 1958|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1958:9): [True: 1, False: 0]
  ------------------
 1959|      1|    aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
 1960|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1960:9): [True: 1, False: 0]
  ------------------
 1961|      1|    aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
 1962|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1962:9): [True: 1, False: 0]
  ------------------
 1963|      1|    aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
 1964|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1964:9): [True: 1, False: 0]
  ------------------
 1965|      1|    aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
 1966|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1966:9): [True: 1, False: 0]
  ------------------
 1967|      1|    aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
 1968|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1968:9): [True: 1, False: 0]
  ------------------
 1969|      1|    aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
 1970|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1970:9): [True: 1, False: 0]
  ------------------
 1971|      1|    aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
 1972|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1972:9): [True: 1, False: 0]
  ------------------
 1973|      1|    aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
 1974|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1974:9): [True: 1, False: 0]
  ------------------
 1975|      1|    aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
 1976|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1976:9): [True: 1, False: 0]
  ------------------
 1977|      1|    aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
 1978|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1978:9): [True: 1, False: 0]
  ------------------
 1979|      1|    aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
 1980|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1980:9): [True: 1, False: 0]
  ------------------
 1981|      1|    aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
 1982|      1|    if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (1982:9): [True: 1, False: 0]
  ------------------
 1983|      1|    aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
 1984|      1|    if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1984:9): [True: 1, False: 0]
  ------------------
 1985|      1|    aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
 1986|      1|    if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1986:9): [True: 1, False: 0]
  ------------------
 1987|      1|    aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
 1988|      1|    if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1988:9): [True: 1, False: 0]
  ------------------
 1989|      1|    aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
 1990|      1|    if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1990:9): [True: 1, False: 0]
  ------------------
 1991|      1|    aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
 1992|      1|    if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1992:9): [True: 1, False: 0]
  ------------------
 1993|      1|    aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
 1994|      1|    if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (1994:9): [True: 1, False: 0]
  ------------------
 1995|      1|}

aom_scale_rtcd.c:setup_rtcd_internal:
   77|      1|{
   78|      1|    int flags = x86_simd_caps();
   79|       |
   80|      1|    (void)flags;
   81|       |
   82|      1|}

av1_rtcd.c:setup_rtcd_internal:
  516|      1|{
  517|      1|    int flags = x86_simd_caps();
  518|       |
  519|      1|    (void)flags;
  520|       |
  521|      1|    av1_apply_selfguided_restoration = av1_apply_selfguided_restoration_c;
  522|      1|    if (flags & HAS_SSE4_1) av1_apply_selfguided_restoration = av1_apply_selfguided_restoration_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (522:9): [True: 1, False: 0]
  ------------------
  523|      1|    if (flags & HAS_AVX2) av1_apply_selfguided_restoration = av1_apply_selfguided_restoration_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (523:9): [True: 1, False: 0]
  ------------------
  524|      1|    av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
  525|      1|    if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (525:9): [True: 1, False: 0]
  ------------------
  526|      1|    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (526:9): [True: 1, False: 0]
  ------------------
  527|      1|    av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
  528|      1|    if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (528:9): [True: 1, False: 0]
  ------------------
  529|      1|    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (529:9): [True: 1, False: 0]
  ------------------
  530|      1|    av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
  531|      1|    if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (531:9): [True: 1, False: 0]
  ------------------
  532|      1|    if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (532:9): [True: 1, False: 0]
  ------------------
  533|      1|    av1_convolve_2d_scale = av1_convolve_2d_scale_c;
  534|      1|    if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (534:9): [True: 1, False: 0]
  ------------------
  535|      1|    av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
  536|      1|    if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (536:9): [True: 1, False: 0]
  ------------------
  537|      1|    av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
  538|      1|    if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (538:9): [True: 1, False: 0]
  ------------------
  539|      1|    av1_convolve_x_sr = av1_convolve_x_sr_sse2;
  540|      1|    if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (540:9): [True: 1, False: 0]
  ------------------
  541|      1|    av1_convolve_y_sr = av1_convolve_y_sr_sse2;
  542|      1|    if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (542:9): [True: 1, False: 0]
  ------------------
  543|      1|    av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
  544|      1|    if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (544:9): [True: 1, False: 0]
  ------------------
  545|      1|    if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (545:9): [True: 1, False: 0]
  ------------------
  546|      1|    av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2;
  547|      1|    if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (547:9): [True: 1, False: 0]
  ------------------
  548|      1|    av1_dist_wtd_convolve_x = av1_dist_wtd_convolve_x_sse2;
  549|      1|    if (flags & HAS_AVX2) av1_dist_wtd_convolve_x = av1_dist_wtd_convolve_x_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (549:9): [True: 1, False: 0]
  ------------------
  550|      1|    av1_dist_wtd_convolve_y = av1_dist_wtd_convolve_y_sse2;
  551|      1|    if (flags & HAS_AVX2) av1_dist_wtd_convolve_y = av1_dist_wtd_convolve_y_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (551:9): [True: 1, False: 0]
  ------------------
  552|      1|    av1_dr_prediction_z1 = av1_dr_prediction_z1_c;
  553|      1|    if (flags & HAS_SSE4_1) av1_dr_prediction_z1 = av1_dr_prediction_z1_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (553:9): [True: 1, False: 0]
  ------------------
  554|      1|    if (flags & HAS_AVX2) av1_dr_prediction_z1 = av1_dr_prediction_z1_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (554:9): [True: 1, False: 0]
  ------------------
  555|      1|    av1_dr_prediction_z2 = av1_dr_prediction_z2_c;
  556|      1|    if (flags & HAS_SSE4_1) av1_dr_prediction_z2 = av1_dr_prediction_z2_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (556:9): [True: 1, False: 0]
  ------------------
  557|      1|    if (flags & HAS_AVX2) av1_dr_prediction_z2 = av1_dr_prediction_z2_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (557:9): [True: 1, False: 0]
  ------------------
  558|      1|    av1_dr_prediction_z3 = av1_dr_prediction_z3_c;
  559|      1|    if (flags & HAS_SSE4_1) av1_dr_prediction_z3 = av1_dr_prediction_z3_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (559:9): [True: 1, False: 0]
  ------------------
  560|      1|    if (flags & HAS_AVX2) av1_dr_prediction_z3 = av1_dr_prediction_z3_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (560:9): [True: 1, False: 0]
  ------------------
  561|      1|    av1_filter_intra_edge = av1_filter_intra_edge_c;
  562|      1|    if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (562:9): [True: 1, False: 0]
  ------------------
  563|      1|    av1_filter_intra_predictor = av1_filter_intra_predictor_c;
  564|      1|    if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (564:9): [True: 1, False: 0]
  ------------------
  565|      1|    av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
  566|      1|    if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (566:9): [True: 1, False: 0]
  ------------------
  567|      1|    av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
  568|      1|    if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (568:9): [True: 1, False: 0]
  ------------------
  569|      1|    if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (569:9): [True: 1, False: 0]
  ------------------
  570|      1|    av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
  571|      1|    if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (571:9): [True: 1, False: 0]
  ------------------
  572|      1|    av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
  573|      1|    if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (573:9): [True: 1, False: 0]
  ------------------
  574|      1|    if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (574:9): [True: 1, False: 0]
  ------------------
  575|      1|    av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
  576|      1|    if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (576:9): [True: 1, False: 0]
  ------------------
  577|      1|    if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (577:9): [True: 1, False: 0]
  ------------------
  578|      1|    av1_highbd_dist_wtd_convolve_2d = av1_highbd_dist_wtd_convolve_2d_c;
  579|      1|    if (flags & HAS_SSE4_1) av1_highbd_dist_wtd_convolve_2d = av1_highbd_dist_wtd_convolve_2d_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (579:9): [True: 1, False: 0]
  ------------------
  580|      1|    if (flags & HAS_AVX2) av1_highbd_dist_wtd_convolve_2d = av1_highbd_dist_wtd_convolve_2d_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (580:9): [True: 1, False: 0]
  ------------------
  581|      1|    av1_highbd_dist_wtd_convolve_2d_copy = av1_highbd_dist_wtd_convolve_2d_copy_c;
  582|      1|    if (flags & HAS_SSE4_1) av1_highbd_dist_wtd_convolve_2d_copy = av1_highbd_dist_wtd_convolve_2d_copy_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (582:9): [True: 1, False: 0]
  ------------------
  583|      1|    if (flags & HAS_AVX2) av1_highbd_dist_wtd_convolve_2d_copy = av1_highbd_dist_wtd_convolve_2d_copy_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (583:9): [True: 1, False: 0]
  ------------------
  584|      1|    av1_highbd_dist_wtd_convolve_x = av1_highbd_dist_wtd_convolve_x_c;
  585|      1|    if (flags & HAS_SSE4_1) av1_highbd_dist_wtd_convolve_x = av1_highbd_dist_wtd_convolve_x_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (585:9): [True: 1, False: 0]
  ------------------
  586|      1|    if (flags & HAS_AVX2) av1_highbd_dist_wtd_convolve_x = av1_highbd_dist_wtd_convolve_x_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (586:9): [True: 1, False: 0]
  ------------------
  587|      1|    av1_highbd_dist_wtd_convolve_y = av1_highbd_dist_wtd_convolve_y_c;
  588|      1|    if (flags & HAS_SSE4_1) av1_highbd_dist_wtd_convolve_y = av1_highbd_dist_wtd_convolve_y_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (588:9): [True: 1, False: 0]
  ------------------
  589|      1|    if (flags & HAS_AVX2) av1_highbd_dist_wtd_convolve_y = av1_highbd_dist_wtd_convolve_y_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (589:9): [True: 1, False: 0]
  ------------------
  590|      1|    av1_highbd_dr_prediction_z1 = av1_highbd_dr_prediction_z1_c;
  591|      1|    if (flags & HAS_AVX2) av1_highbd_dr_prediction_z1 = av1_highbd_dr_prediction_z1_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (591:9): [True: 1, False: 0]
  ------------------
  592|      1|    av1_highbd_dr_prediction_z2 = av1_highbd_dr_prediction_z2_c;
  593|      1|    if (flags & HAS_AVX2) av1_highbd_dr_prediction_z2 = av1_highbd_dr_prediction_z2_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (593:9): [True: 1, False: 0]
  ------------------
  594|      1|    av1_highbd_dr_prediction_z3 = av1_highbd_dr_prediction_z3_c;
  595|      1|    if (flags & HAS_AVX2) av1_highbd_dr_prediction_z3 = av1_highbd_dr_prediction_z3_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (595:9): [True: 1, False: 0]
  ------------------
  596|      1|    av1_highbd_filter_intra_edge = av1_highbd_filter_intra_edge_c;
  597|      1|    if (flags & HAS_SSE4_1) av1_highbd_filter_intra_edge = av1_highbd_filter_intra_edge_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (597:9): [True: 1, False: 0]
  ------------------
  598|      1|    av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
  599|      1|    if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (599:9): [True: 1, False: 0]
  ------------------
  600|      1|    if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (600:9): [True: 1, False: 0]
  ------------------
  601|      1|    av1_highbd_iwht4x4_16_add = av1_highbd_iwht4x4_16_add_c;
  602|      1|    if (flags & HAS_SSE4_1) av1_highbd_iwht4x4_16_add = av1_highbd_iwht4x4_16_add_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (602:9): [True: 1, False: 0]
  ------------------
  603|      1|    av1_highbd_upsample_intra_edge = av1_highbd_upsample_intra_edge_c;
  604|      1|    if (flags & HAS_SSE4_1) av1_highbd_upsample_intra_edge = av1_highbd_upsample_intra_edge_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (604:9): [True: 1, False: 0]
  ------------------
  605|      1|    av1_highbd_warp_affine = av1_highbd_warp_affine_c;
  606|      1|    if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (606:9): [True: 1, False: 0]
  ------------------
  607|      1|    if (flags & HAS_AVX2) av1_highbd_warp_affine = av1_highbd_warp_affine_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (607:9): [True: 1, False: 0]
  ------------------
  608|      1|    av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
  609|      1|    if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (609:9): [True: 1, False: 0]
  ------------------
  610|      1|    if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (610:9): [True: 1, False: 0]
  ------------------
  611|      1|    av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
  612|      1|    if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (612:9): [True: 1, False: 0]
  ------------------
  613|      1|    av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
  614|      1|    if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (614:9): [True: 1, False: 0]
  ------------------
  615|      1|    av1_inv_txfm_add = av1_inv_txfm_add_c;
  616|      1|    if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (616:9): [True: 1, False: 0]
  ------------------
  617|      1|    if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (617:9): [True: 1, False: 0]
  ------------------
  618|      1|    av1_resize_and_extend_frame = av1_resize_and_extend_frame_c;
  619|      1|    if (flags & HAS_SSSE3) av1_resize_and_extend_frame = av1_resize_and_extend_frame_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (619:9): [True: 1, False: 0]
  ------------------
  620|      1|    av1_resize_horz_dir = av1_resize_horz_dir_sse2;
  621|      1|    if (flags & HAS_AVX2) av1_resize_horz_dir = av1_resize_horz_dir_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (621:9): [True: 1, False: 0]
  ------------------
  622|      1|    av1_resize_vert_dir = av1_resize_vert_dir_sse2;
  623|      1|    if (flags & HAS_AVX2) av1_resize_vert_dir = av1_resize_vert_dir_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (623:9): [True: 1, False: 0]
  ------------------
  624|      1|    av1_round_shift_array = av1_round_shift_array_c;
  625|      1|    if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (625:9): [True: 1, False: 0]
  ------------------
  626|      1|    av1_selfguided_restoration = av1_selfguided_restoration_c;
  627|      1|    if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (627:9): [True: 1, False: 0]
  ------------------
  628|      1|    if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (628:9): [True: 1, False: 0]
  ------------------
  629|      1|    av1_upsample_intra_edge = av1_upsample_intra_edge_c;
  630|      1|    if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (630:9): [True: 1, False: 0]
  ------------------
  631|      1|    av1_warp_affine = av1_warp_affine_c;
  632|      1|    if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (632:9): [True: 1, False: 0]
  ------------------
  633|      1|    if (flags & HAS_AVX2) av1_warp_affine = av1_warp_affine_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (633:9): [True: 1, False: 0]
  ------------------
  634|      1|    av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
  635|      1|    if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (635:9): [True: 1, False: 0]
  ------------------
  636|      1|    cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
  637|      1|    if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (637:9): [True: 1, False: 0]
  ------------------
  638|      1|    if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (638:9): [True: 1, False: 0]
  ------------------
  639|      1|    cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
  640|      1|    if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (640:9): [True: 1, False: 0]
  ------------------
  641|      1|    if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (641:9): [True: 1, False: 0]
  ------------------
  642|      1|    cdef_filter_16_0 = cdef_filter_16_0_c;
  643|      1|    if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (643:9): [True: 1, False: 0]
  ------------------
  644|      1|    if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (644:9): [True: 1, False: 0]
  ------------------
  645|      1|    cdef_filter_16_1 = cdef_filter_16_1_c;
  646|      1|    if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (646:9): [True: 1, False: 0]
  ------------------
  647|      1|    if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (647:9): [True: 1, False: 0]
  ------------------
  648|      1|    cdef_filter_16_2 = cdef_filter_16_2_c;
  649|      1|    if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (649:9): [True: 1, False: 0]
  ------------------
  650|      1|    if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (650:9): [True: 1, False: 0]
  ------------------
  651|      1|    cdef_filter_16_3 = cdef_filter_16_3_c;
  652|      1|    if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (652:9): [True: 1, False: 0]
  ------------------
  653|      1|    if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (653:9): [True: 1, False: 0]
  ------------------
  654|      1|    cdef_filter_8_0 = cdef_filter_8_0_c;
  655|      1|    if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (655:9): [True: 1, False: 0]
  ------------------
  656|      1|    if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (656:9): [True: 1, False: 0]
  ------------------
  657|      1|    cdef_filter_8_1 = cdef_filter_8_1_c;
  658|      1|    if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (658:9): [True: 1, False: 0]
  ------------------
  659|      1|    if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (659:9): [True: 1, False: 0]
  ------------------
  660|      1|    cdef_filter_8_2 = cdef_filter_8_2_c;
  661|      1|    if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (661:9): [True: 1, False: 0]
  ------------------
  662|      1|    if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (662:9): [True: 1, False: 0]
  ------------------
  663|      1|    cdef_filter_8_3 = cdef_filter_8_3_c;
  664|      1|    if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (664:9): [True: 1, False: 0]
  ------------------
  665|      1|    if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (665:9): [True: 1, False: 0]
  ------------------
  666|      1|    cdef_find_dir = cdef_find_dir_c;
  667|      1|    if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (667:9): [True: 1, False: 0]
  ------------------
  668|      1|    if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (668:9): [True: 1, False: 0]
  ------------------
  669|      1|    cdef_find_dir_dual = cdef_find_dir_dual_c;
  670|      1|    if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
  ------------------
  |  |  166|      1|#define HAS_SSE4_1 0x20
  ------------------
  |  Branch (670:9): [True: 1, False: 0]
  ------------------
  671|      1|    if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (671:9): [True: 1, False: 0]
  ------------------
  672|      1|    cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
  673|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (673:9): [True: 1, False: 0]
  ------------------
  674|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (674:9): [True: 1, False: 0]
  ------------------
  675|      1|    cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
  676|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (676:9): [True: 1, False: 0]
  ------------------
  677|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (677:9): [True: 1, False: 0]
  ------------------
  678|      1|    cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
  679|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (679:9): [True: 1, False: 0]
  ------------------
  680|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (680:9): [True: 1, False: 0]
  ------------------
  681|      1|    cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
  682|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (682:9): [True: 1, False: 0]
  ------------------
  683|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (683:9): [True: 1, False: 0]
  ------------------
  684|      1|    cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
  685|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (685:9): [True: 1, False: 0]
  ------------------
  686|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (686:9): [True: 1, False: 0]
  ------------------
  687|      1|    cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
  688|      1|    if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (688:9): [True: 1, False: 0]
  ------------------
  689|      1|    if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (689:9): [True: 1, False: 0]
  ------------------
  690|      1|    cfl_get_predict_hbd_fn = cfl_get_predict_hbd_fn_c;
  691|      1|    if (flags & HAS_SSSE3) cfl_get_predict_hbd_fn = cfl_get_predict_hbd_fn_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (691:9): [True: 1, False: 0]
  ------------------
  692|      1|    if (flags & HAS_AVX2) cfl_get_predict_hbd_fn = cfl_get_predict_hbd_fn_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (692:9): [True: 1, False: 0]
  ------------------
  693|      1|    cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_c;
  694|      1|    if (flags & HAS_SSSE3) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_ssse3;
  ------------------
  |  |  165|      1|#define HAS_SSSE3 0x10
  ------------------
  |  Branch (694:9): [True: 1, False: 0]
  ------------------
  695|      1|    if (flags & HAS_AVX2) cfl_get_predict_lbd_fn = cfl_get_predict_lbd_fn_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (695:9): [True: 1, False: 0]
  ------------------
  696|      1|    cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_sse2;
  697|      1|    if (flags & HAS_AVX2) cfl_get_subtract_average_fn = cfl_get_subtract_average_fn_avx2;
  ------------------
  |  |  168|      1|#define HAS_AVX2 0x80
  ------------------
  |  Branch (697:9): [True: 1, False: 0]
  ------------------
  698|      1|}